From b05a3c1b3db9b82dfe86cd0d6db7b91ff89bd928 Mon Sep 17 00:00:00 2001 From: frreiss Date: Wed, 16 Oct 2019 11:49:28 -0700 Subject: [PATCH 001/442] Improve API documentation for WindowDataset op --- .../base_api/api_def_WindowDataset.pbtxt | 39 +++++++++++++++---- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/tensorflow/core/api_def/base_api/api_def_WindowDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_WindowDataset.pbtxt index 01387b75279..2e56f32cb2b 100644 --- a/tensorflow/core/api_def/base_api/api_def_WindowDataset.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_WindowDataset.pbtxt @@ -4,29 +4,54 @@ op { in_arg { name: "size" description: < Date: Mon, 18 Nov 2019 00:22:24 +0530 Subject: [PATCH 002/442] Update array_ops.py Update documentation, formatting and fix typos for `tf.broadcast_dynamic_shape`, `tf.broadcast_static_shape`, `tf.boolean_mask` --- tensorflow/python/ops/array_ops.py | 112 +++++++++++++++++------------ 1 file changed, 65 insertions(+), 47 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index fd0c3b2ad1e..046000510a9 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -432,23 +432,31 @@ setdiff1d.__doc__ = gen_array_ops.list_diff.__doc__ def broadcast_dynamic_shape(shape_x, shape_y): """Computes the shape of a broadcast given symbolic shapes. - When shape_x and shape_y are Tensors representing shapes (i.e. the result of + When `shape_x` and `shape_y` are Tensors representing shapes (i.e. the result of calling tf.shape on another Tensor) this computes a Tensor which is the shape - of the result of a broadcasting op applied in tensors of shapes shape_x and - shape_y. - - For example, if shape_x is [1, 2, 3] and shape_y is [5, 1, 3], the result is a - Tensor whose value is [5, 2, 3]. + of the result of a broadcasting op applied in tensors of shapes `shape_x` and + `shape_y`. This is useful when validating the result of a broadcasting operation when the tensors do not have statically known shapes. + Example: + + >>> shape_x = [1, 2, 3] + >>> shape_y = [5, 1, 3] + >>> broadcast_dynamic_shape(shape_x, shape_y) + + Args: shape_x: A rank 1 integer `Tensor`, representing the shape of x. shape_y: A rank 1 integer `Tensor`, representing the shape of y. Returns: A rank 1 integer `Tensor` representing the broadcasted shape. + + Raises: + InvalidArgumentError: If the two shapes are incompatible for + broadcasting. """ return gen_array_ops.broadcast_args(shape_x, shape_y) @@ -457,9 +465,9 @@ def broadcast_dynamic_shape(shape_x, shape_y): def broadcast_static_shape(shape_x, shape_y): """Computes the shape of a broadcast given known shapes. - When shape_x and shape_y are fully known TensorShapes this computes a - TensorShape which is the shape of the result of a broadcasting op applied in - tensors of shapes shape_x and shape_y. + When `shape_x` and `shape_y` are fully known `TensorShape`s this computes a + `TensorShape` which is the shape of the result of a broadcasting op applied in + tensors of shapes `shape_x` and `shape_y`. For example, if shape_x is [1, 2, 3] and shape_y is [5, 1, 3], the result is a TensorShape whose value is [5, 2, 3]. @@ -467,6 +475,13 @@ def broadcast_static_shape(shape_x, shape_y): This is useful when validating the result of a broadcasting operation when the tensors have statically known shapes. 
+ Example: + + >>> shape_x = tf.TensorShape([1, 2, 3]) + >>> shape_y = tf.TensorShape([5, 1 ,3]) + >>> broadcast_static_shape(shape_x, shape_y) + TensorShape([Dimension(5), Dimension(2), Dimension(3)]) + Args: shape_x: A `TensorShape` shape_y: A `TensorShape` @@ -1523,13 +1538,6 @@ def boolean_mask(tensor, mask, name="boolean_mask", axis=None): Numpy equivalent is `tensor[mask]`. - ```python - # 1-D example - tensor = [0, 1, 2, 3] - mask = np.array([True, False, True, False]) - boolean_mask(tensor, mask) # [0, 2] - ``` - In general, `0 < dim(mask) = K <= dim(tensor)`, and `mask`'s shape must match the first K dimensions of `tensor`'s shape. We then have: `boolean_mask(tensor, mask)[i, j1,...,jd] = tensor[i1,...,iK,j1,...,jd]` @@ -1542,9 +1550,23 @@ def boolean_mask(tensor, mask, name="boolean_mask", axis=None): ragged tensors, and can be used if you need to preserve the masked dimensions of `tensor` (rather than flattening them, as `tf.boolean_mask` does). + Examples: + + ```python + # 1-D example + tensor = [0, 1, 2, 3] + mask = np.array([True, False, True, False]) + boolean_mask(tensor, mask) # [0, 2] + + # 2-D example + tensor = [[1, 2], [3, 4], [5, 6]] + mask = np.array([True, False, True]) + boolean_mask(tensor, mask) # [[1, 2], [5, 6]] + ``` + Args: - tensor: N-D tensor. - mask: K-D boolean tensor, K <= N and K must be known statically. + tensor: N-D Tensor. + mask: K-D boolean Tensor, K <= N and K must be known statically. name: A name for this operation (optional). axis: A 0-D int Tensor representing the axis in `tensor` to mask from. By default, axis is 0 which will mask from the first dimension. Otherwise K + @@ -1556,15 +1578,6 @@ def boolean_mask(tensor, mask, name="boolean_mask", axis=None): Raises: ValueError: If shapes do not conform. - - Examples: - - ```python - # 2-D example - tensor = [[1, 2], [3, 4], [5, 6]] - mask = np.array([True, False, True]) - boolean_mask(tensor, mask) # [[1, 2], [5, 6]] - ``` """ def _apply_mask_1d(reshaped_tensor, mask, axis=None): @@ -1611,13 +1624,6 @@ def boolean_mask_v2(tensor, mask, axis=None, name="boolean_mask"): Numpy equivalent is `tensor[mask]`. - ```python - # 1-D example - tensor = [0, 1, 2, 3] - mask = np.array([True, False, True, False]) - boolean_mask(tensor, mask) # [0, 2] - ``` - In general, `0 < dim(mask) = K <= dim(tensor)`, and `mask`'s shape must match the first K dimensions of `tensor`'s shape. We then have: `boolean_mask(tensor, mask)[i, j1,...,jd] = tensor[i1,...,iK,j1,...,jd]` @@ -1630,9 +1636,21 @@ def boolean_mask_v2(tensor, mask, axis=None, name="boolean_mask"): ragged tensors, and can be used if you need to preserve the masked dimensions of `tensor` (rather than flattening them, as `tf.boolean_mask` does). + Examples: + + >>> tensor = [0, 1, 2, 3] # 1-D example + >>> mask = np.array([True, False, True, False]) + >>> boolean_mask(tensor, mask) + + + >>> tensor = [[1, 2], [3, 4], [5, 6]] # 2-D example + >>> mask = np.array([True, False, True]) + >>> boolean_mask(tensor, mask) + + Args: - tensor: N-D tensor. - mask: K-D boolean tensor, K <= N and K must be known statically. + tensor: N-D Tensor. + mask: K-D boolean Tensor, K <= N and K must be known statically. axis: A 0-D int Tensor representing the axis in `tensor` to mask from. By default, axis is 0 which will mask from the first dimension. Otherwise K + axis <= N. @@ -3433,18 +3451,18 @@ def batch_to_space_v2(input, block_shape, crops, name=None): # pylint: disable= This operation is equivalent to the following steps: 1. 
Reshape `input` to `reshaped` of shape: [block_shape[0], ..., block_shape[M-1], batch / prod(block_shape), input_shape[1], ..., - input_shape[N-1]] - 2. Permute dimensions of `reshaped` to produce `permuted` of shape - [batch / prod(block_shape), input_shape[1], block_shape[0], ..., + input_shape[N-1]] + 2. Permute dimensions of `reshaped` to produce `permuted` of shape + [batch / prod(block_shape), input_shape[1], block_shape[0], ..., input_shape[M], block_shape[M-1], input_shape[M+1], - ..., input_shape[N-1]] - 3. Reshape `permuted` to produce `reshaped_permuted` of shape - [batch / prod(block_shape), input_shape[1] * block_shape[0], ..., - input_shape[M] * block_shape[M-1], input_shape[M+1], ..., - input_shape[N-1]] - 4. Crop the start and end of dimensions `[1, ..., M]` of - `reshaped_permuted` according to `crops` to produce the output - of shape: + ..., input_shape[N-1]] + 3. Reshape `permuted` to produce `reshaped_permuted` of shape + [batch / prod(block_shape), input_shape[1] * block_shape[0], ..., + input_shape[M] * block_shape[M-1], input_shape[M+1], ..., + input_shape[N-1]] + 4. Crop the start and end of dimensions `[1, ..., M]` of + `reshaped_permuted` according to `crops` to produce the output + of shape: [batch / prod(block_shape), input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1], ..., input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1], input_shape[M+1], From 2a6efd2e668f8418bdf1c60e8218791559724dc4 Mon Sep 17 00:00:00 2001 From: nikochiko Date: Mon, 18 Nov 2019 10:49:24 +0530 Subject: [PATCH 003/442] Update docstrings Updated docstrings for `tf.convert_to_tensor` and `tf.edit_distance`. `tf.convert_to_tensor`: Put example in "For example:" section and switch to carets from backticks. `tf.edit_distance`: Updated documentatoin, fixed example. --- tensorflow/python/framework/ops.py | 24 +++++------ tensorflow/python/ops/array_ops.py | 68 +++++++++++++++++++----------- 2 files changed, 55 insertions(+), 37 deletions(-) diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 8a273e834be..5b95d9df7cd 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -1204,20 +1204,20 @@ def convert_to_tensor_v2(value, dtype=None, dtype_hint=None, name=None): This function converts Python objects of various types to `Tensor` objects. It accepts `Tensor` objects, numpy arrays, Python lists, - and Python scalars. For example: + and Python scalars. - ```python - import numpy as np + For example: - def my_func(arg): - arg = tf.convert_to_tensor(arg, dtype=tf.float32) - return tf.matmul(arg, arg) + arg - - # The following calls are equivalent. - value_1 = my_func(tf.constant([[1.0, 2.0], [3.0, 4.0]])) - value_2 = my_func([[1.0, 2.0], [3.0, 4.0]]) - value_3 = my_func(np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)) - ``` + >>> import numpy as np + >>> def my_func(arg): + ... arg = tf.convert_to_tensor(arg, dtype=tf.float32) + ... return tf.matmul(arg, arg) + arg + ... + >>> # The following calls are equivalent. + ... + >>> value_1 = my_func(tf.constant([[1.0, 2.0], [3.0, 4.0]])) + >>> value_2 = my_func([[1.0, 2.0], [3.0, 4.0]]) + >>> value_3 = my_func(np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)) This function can be useful when composing a new operation in Python (such as `my_func` in the example above). 
All standard Python op diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 046000510a9..6a18d08f22f 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -3157,41 +3157,59 @@ def edit_distance(hypothesis, truth, normalize=True, name="edit_distance"): You can normalize the edit distance by length of `truth` by setting `normalize` to true. - For example, given the following input: + For example: + + Given the following input, + * `hypothesis` is a `tf.SparseTensor` of shape `[2, 1, 1]` + * `truth` is a `tf.SparseTensor` of shape `[2, 2, 2]` + + >>> hypothesis = tf.SparseTensor( + ... [[0, 0, 0], + ... [1, 0, 0]], + ... ["a", "b"], + ... (2, 1, 1)) + >>> truth = tf.SparseTensor( + ... [[0, 1, 0], + ... [1, 0, 0], + ... [1, 0, 1], + ... [1, 1, 0]], + ... ["a", "b", "c", "a"], + ... (2, 2, 2)) + >>> edit_distance(hypothesis, truth, normalize=True) + + + The operaton returns a dense Tensor of shape `[2, 2]` with + edit distances normalized by `truth` lengths. + + **Note**: It is possible to calculate edit distance between two + sparse tensors with variable-length values. However, attempting to create + them while eager execution is enabled will result in a `ValueError`. + + For the following inputs, ```python # 'hypothesis' is a tensor of shape `[2, 1]` with variable-length values: - # (0,0) = ["a"] - # (1,0) = ["b"] hypothesis = tf.SparseTensor( - [[0, 0, 0], - [1, 0, 0]], - ["a", "b"], - (2, 1, 1)) + [[0, 0], + [1,0]], + ["a", "b"], + (2, 1)) # 'truth' is a tensor of shape `[2, 2]` with variable-length values: - # (0,0) = [] - # (0,1) = ["a"] - # (1,0) = ["b", "c"] - # (1,1) = ["a"] truth = tf.SparseTensor( - [[0, 1, 0], - [1, 0, 0], - [1, 0, 1], - [1, 1, 0]], - ["a", "b", "c", "a"], - (2, 2, 2)) + [[0, 1], + [1, 0], + [1, 1]], + ["a", ["b", "c"], "a"], + (2, 2)) normalize = True - ``` - This operation would return the following: - - ```python - # 'output' is a tensor of shape `[2, 2]` with edit distances normalized - # by 'truth' lengths. - output ==> [[inf, 1.0], # (0,0): no truth, (0,1): no hypothesis - [0.5, 1.0]] # (1,0): addition, (1,1): no hypothesis + # The output would be a dense Tensor of shape `[2,]`, with edit distances + noramlized by 'truth' lengths. + # output => array([0., 0.5], dtype=float32) ``` Args: From fdadd0e5e524df6488cd763c4ab7595d469ed1ef Mon Sep 17 00:00:00 2001 From: nikochiko Date: Mon, 18 Nov 2019 23:04:25 +0530 Subject: [PATCH 004/442] Update save.py Fix https://github.com/tensorflow/tensorflow/issues/34348 . Notes: - Documentation needs to be changed (in multiple places) after final changes in code. - Changed code for deciding whether to save file as h5 or tf. - Removed the unncessary _HDF5_EXTENSIONS list. Will have to make sure it wasn't used elsewhere. - Added 4 new ValueError raises. 
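For context, a rough sketch (not part of the patch itself) of the user-facing behavior this change aims for — the file names are placeholders and the exact error text may differ:

    import tensorflow as tf

    model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])

    # An HDF5-style suffix (".h5"/".hdf5"/".keras") implies save_format='h5'.
    model.save('my_model.h5')

    # A plain path with no HDF5 suffix defaults to the SavedModel format when TF2 is enabled.
    model.save('my_saved_model')

    # Conflicting arguments now raise a ValueError instead of silently picking a format.
    model.save('my_model.h5', save_format='tf')  # ValueError
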
--- tensorflow/python/keras/saving/save.py | 44 +++++++++++++++++++++----- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/keras/saving/save.py b/tensorflow/python/keras/saving/save.py index 4be3aa0bbda..9f7f5778afe 100644 --- a/tensorflow/python/keras/saving/save.py +++ b/tensorflow/python/keras/saving/save.py @@ -23,6 +23,7 @@ import os import six from tensorflow.python import tf2 +from tensorflow.python.keras.engine.network import _is_hdf5_filepath from tensorflow.python.keras.saving import hdf5_format from tensorflow.python.keras.saving.saved_model import load as saved_model_load from tensorflow.python.keras.saving.saved_model import save as saved_model_save @@ -36,9 +37,6 @@ except ImportError: h5py = None # pylint: enable=g-import-not-at-top -_HDF5_EXTENSIONS = ['.h5', '.hdf5', '.keras'] - - # TODO(kathywu): Remove this when Keras SavedModel is not experimental. _KERAS_SAVED_MODEL_STILL_EXPERIMENTAL = True @@ -92,12 +90,42 @@ def save_model(model, """ from tensorflow.python.keras.engine import sequential # pylint: disable=g-import-not-at-top - default_format = 'tf' if tf2.enabled() else 'h5' - save_format = save_format or default_format + if type(filepath) != str and not isinstance(filepath, h5py.File): + raise ValueError( + 'Expected `filepath` to be a String or `h5py.File` object. Got' + 'unsupported value %s of type %s' + % (filepath, type(filepath))) - if (save_format == 'h5' or - (h5py is not None and isinstance(filepath, h5py.File)) or - os.path.splitext(filepath)[1] in _HDF5_EXTENSIONS): + filepath_is_h5py_file = h5py is not None and isinstance(filepath, h5py.File) + filepath_is_h5 = type(filepath) == str and _is_hdf5_filepath(filepath) + if save_format is None: + if (filepath_is_h5 or + (filepath_is_h5py_file)): + save_format = 'h5' + else: + save_format = 'tf' if tf2.enabled() else 'h5' + else: + user_format = save_format.lower().strip() + if user_format in ('tensorflow', 'tf'): + save_format = 'tf' + elif user_format in ('hdf5', 'h5', 'keras'): + save_format = 'h5' + else: + raise ValueError( + 'Unknown format "%s". Was expecting one of {"tf", "h5"}.' % ( + save_format,)) + if save_format == 'tf' and filepath_is_h5: + raise ValueError( + ('`save` got save_format="tf"/"tensorflow", but the ' + 'filepath ("%s") looks like an HDF5 file. Omit the ".h5"/".keras" ' + 'when saving in TensorFlow format.') + % filepath) + if save_format == 'tf' and filepath_is_h5py_file: + raise ValueError( + '`save` got save_format="tf"/"tensorflow", but the given `filepath`' + 'is an `h5py.File` object.') + + if save_format == 'h5': # TODO(b/130258301): add utility method for detecting model type. if (not model._is_graph_network and # pylint:disable=protected-access not isinstance(model, sequential.Sequential)): From b33be57b2b02b1abc159edc44155b46f0bf26cad Mon Sep 17 00:00:00 2001 From: nikochiko Date: Mon, 18 Nov 2019 23:43:27 +0530 Subject: [PATCH 005/442] Revert "Update docstrings" This reverts commit 2a6efd2e668f8418bdf1c60e8218791559724dc4. 
--- tensorflow/python/framework/ops.py | 24 +++++------ tensorflow/python/ops/array_ops.py | 68 +++++++++++------------------- 2 files changed, 37 insertions(+), 55 deletions(-) diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 5b95d9df7cd..8a273e834be 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -1204,20 +1204,20 @@ def convert_to_tensor_v2(value, dtype=None, dtype_hint=None, name=None): This function converts Python objects of various types to `Tensor` objects. It accepts `Tensor` objects, numpy arrays, Python lists, - and Python scalars. + and Python scalars. For example: - For example: + ```python + import numpy as np - >>> import numpy as np - >>> def my_func(arg): - ... arg = tf.convert_to_tensor(arg, dtype=tf.float32) - ... return tf.matmul(arg, arg) + arg - ... - >>> # The following calls are equivalent. - ... - >>> value_1 = my_func(tf.constant([[1.0, 2.0], [3.0, 4.0]])) - >>> value_2 = my_func([[1.0, 2.0], [3.0, 4.0]]) - >>> value_3 = my_func(np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)) + def my_func(arg): + arg = tf.convert_to_tensor(arg, dtype=tf.float32) + return tf.matmul(arg, arg) + arg + + # The following calls are equivalent. + value_1 = my_func(tf.constant([[1.0, 2.0], [3.0, 4.0]])) + value_2 = my_func([[1.0, 2.0], [3.0, 4.0]]) + value_3 = my_func(np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)) + ``` This function can be useful when composing a new operation in Python (such as `my_func` in the example above). All standard Python op diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 6a18d08f22f..046000510a9 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -3157,59 +3157,41 @@ def edit_distance(hypothesis, truth, normalize=True, name="edit_distance"): You can normalize the edit distance by length of `truth` by setting `normalize` to true. - For example: - - Given the following input, - * `hypothesis` is a `tf.SparseTensor` of shape `[2, 1, 1]` - * `truth` is a `tf.SparseTensor` of shape `[2, 2, 2]` - - >>> hypothesis = tf.SparseTensor( - ... [[0, 0, 0], - ... [1, 0, 0]], - ... ["a", "b"], - ... (2, 1, 1)) - >>> truth = tf.SparseTensor( - ... [[0, 1, 0], - ... [1, 0, 0], - ... [1, 0, 1], - ... [1, 1, 0]], - ... ["a", "b", "c", "a"], - ... (2, 2, 2)) - >>> edit_distance(hypothesis, truth, normalize=True) - - - The operaton returns a dense Tensor of shape `[2, 2]` with - edit distances normalized by `truth` lengths. - - **Note**: It is possible to calculate edit distance between two - sparse tensors with variable-length values. However, attempting to create - them while eager execution is enabled will result in a `ValueError`. - - For the following inputs, + For example, given the following input: ```python # 'hypothesis' is a tensor of shape `[2, 1]` with variable-length values: + # (0,0) = ["a"] + # (1,0) = ["b"] hypothesis = tf.SparseTensor( - [[0, 0], - [1,0]], - ["a", "b"], - (2, 1)) + [[0, 0, 0], + [1, 0, 0]], + ["a", "b"], + (2, 1, 1)) # 'truth' is a tensor of shape `[2, 2]` with variable-length values: + # (0,0) = [] + # (0,1) = ["a"] + # (1,0) = ["b", "c"] + # (1,1) = ["a"] truth = tf.SparseTensor( - [[0, 1], - [1, 0], - [1, 1]], - ["a", ["b", "c"], "a"], - (2, 2)) + [[0, 1, 0], + [1, 0, 0], + [1, 0, 1], + [1, 1, 0]], + ["a", "b", "c", "a"], + (2, 2, 2)) normalize = True + ``` - # The output would be a dense Tensor of shape `[2,]`, with edit distances - noramlized by 'truth' lengths. 
- # output => array([0., 0.5], dtype=float32) + This operation would return the following: + + ```python + # 'output' is a tensor of shape `[2, 2]` with edit distances normalized + # by 'truth' lengths. + output ==> [[inf, 1.0], # (0,0): no truth, (0,1): no hypothesis + [0.5, 1.0]] # (1,0): addition, (1,1): no hypothesis ``` Args: From e81b7ea8d85bbedf9a0d2d00557400987975373f Mon Sep 17 00:00:00 2001 From: nikochiko Date: Mon, 18 Nov 2019 23:43:34 +0530 Subject: [PATCH 006/442] Revert "Update array_ops.py" This reverts commit 4c9ee36f03d9b01b4d8598905aa26bbf81b380b4. --- tensorflow/python/ops/array_ops.py | 112 ++++++++++++----------------- 1 file changed, 47 insertions(+), 65 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 046000510a9..fd0c3b2ad1e 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -432,31 +432,23 @@ setdiff1d.__doc__ = gen_array_ops.list_diff.__doc__ def broadcast_dynamic_shape(shape_x, shape_y): """Computes the shape of a broadcast given symbolic shapes. - When `shape_x` and `shape_y` are Tensors representing shapes (i.e. the result of + When shape_x and shape_y are Tensors representing shapes (i.e. the result of calling tf.shape on another Tensor) this computes a Tensor which is the shape - of the result of a broadcasting op applied in tensors of shapes `shape_x` and - `shape_y`. + of the result of a broadcasting op applied in tensors of shapes shape_x and + shape_y. + + For example, if shape_x is [1, 2, 3] and shape_y is [5, 1, 3], the result is a + Tensor whose value is [5, 2, 3]. This is useful when validating the result of a broadcasting operation when the tensors do not have statically known shapes. - Example: - - >>> shape_x = [1, 2, 3] - >>> shape_y = [5, 1, 3] - >>> broadcast_dynamic_shape(shape_x, shape_y) - - Args: shape_x: A rank 1 integer `Tensor`, representing the shape of x. shape_y: A rank 1 integer `Tensor`, representing the shape of y. Returns: A rank 1 integer `Tensor` representing the broadcasted shape. - - Raises: - InvalidArgumentError: If the two shapes are incompatible for - broadcasting. """ return gen_array_ops.broadcast_args(shape_x, shape_y) @@ -465,9 +457,9 @@ def broadcast_dynamic_shape(shape_x, shape_y): def broadcast_static_shape(shape_x, shape_y): """Computes the shape of a broadcast given known shapes. - When `shape_x` and `shape_y` are fully known `TensorShape`s this computes a - `TensorShape` which is the shape of the result of a broadcasting op applied in - tensors of shapes `shape_x` and `shape_y`. + When shape_x and shape_y are fully known TensorShapes this computes a + TensorShape which is the shape of the result of a broadcasting op applied in + tensors of shapes shape_x and shape_y. For example, if shape_x is [1, 2, 3] and shape_y is [5, 1, 3], the result is a TensorShape whose value is [5, 2, 3]. @@ -475,13 +467,6 @@ def broadcast_static_shape(shape_x, shape_y): This is useful when validating the result of a broadcasting operation when the tensors have statically known shapes. - Example: - - >>> shape_x = tf.TensorShape([1, 2, 3]) - >>> shape_y = tf.TensorShape([5, 1 ,3]) - >>> broadcast_static_shape(shape_x, shape_y) - TensorShape([Dimension(5), Dimension(2), Dimension(3)]) - Args: shape_x: A `TensorShape` shape_y: A `TensorShape` @@ -1538,6 +1523,13 @@ def boolean_mask(tensor, mask, name="boolean_mask", axis=None): Numpy equivalent is `tensor[mask]`. 
+ ```python + # 1-D example + tensor = [0, 1, 2, 3] + mask = np.array([True, False, True, False]) + boolean_mask(tensor, mask) # [0, 2] + ``` + In general, `0 < dim(mask) = K <= dim(tensor)`, and `mask`'s shape must match the first K dimensions of `tensor`'s shape. We then have: `boolean_mask(tensor, mask)[i, j1,...,jd] = tensor[i1,...,iK,j1,...,jd]` @@ -1550,23 +1542,9 @@ def boolean_mask(tensor, mask, name="boolean_mask", axis=None): ragged tensors, and can be used if you need to preserve the masked dimensions of `tensor` (rather than flattening them, as `tf.boolean_mask` does). - Examples: - - ```python - # 1-D example - tensor = [0, 1, 2, 3] - mask = np.array([True, False, True, False]) - boolean_mask(tensor, mask) # [0, 2] - - # 2-D example - tensor = [[1, 2], [3, 4], [5, 6]] - mask = np.array([True, False, True]) - boolean_mask(tensor, mask) # [[1, 2], [5, 6]] - ``` - Args: - tensor: N-D Tensor. - mask: K-D boolean Tensor, K <= N and K must be known statically. + tensor: N-D tensor. + mask: K-D boolean tensor, K <= N and K must be known statically. name: A name for this operation (optional). axis: A 0-D int Tensor representing the axis in `tensor` to mask from. By default, axis is 0 which will mask from the first dimension. Otherwise K + @@ -1578,6 +1556,15 @@ def boolean_mask(tensor, mask, name="boolean_mask", axis=None): Raises: ValueError: If shapes do not conform. + + Examples: + + ```python + # 2-D example + tensor = [[1, 2], [3, 4], [5, 6]] + mask = np.array([True, False, True]) + boolean_mask(tensor, mask) # [[1, 2], [5, 6]] + ``` """ def _apply_mask_1d(reshaped_tensor, mask, axis=None): @@ -1624,6 +1611,13 @@ def boolean_mask_v2(tensor, mask, axis=None, name="boolean_mask"): Numpy equivalent is `tensor[mask]`. + ```python + # 1-D example + tensor = [0, 1, 2, 3] + mask = np.array([True, False, True, False]) + boolean_mask(tensor, mask) # [0, 2] + ``` + In general, `0 < dim(mask) = K <= dim(tensor)`, and `mask`'s shape must match the first K dimensions of `tensor`'s shape. We then have: `boolean_mask(tensor, mask)[i, j1,...,jd] = tensor[i1,...,iK,j1,...,jd]` @@ -1636,21 +1630,9 @@ def boolean_mask_v2(tensor, mask, axis=None, name="boolean_mask"): ragged tensors, and can be used if you need to preserve the masked dimensions of `tensor` (rather than flattening them, as `tf.boolean_mask` does). - Examples: - - >>> tensor = [0, 1, 2, 3] # 1-D example - >>> mask = np.array([True, False, True, False]) - >>> boolean_mask(tensor, mask) - - - >>> tensor = [[1, 2], [3, 4], [5, 6]] # 2-D example - >>> mask = np.array([True, False, True]) - >>> boolean_mask(tensor, mask) - - Args: - tensor: N-D Tensor. - mask: K-D boolean Tensor, K <= N and K must be known statically. + tensor: N-D tensor. + mask: K-D boolean tensor, K <= N and K must be known statically. axis: A 0-D int Tensor representing the axis in `tensor` to mask from. By default, axis is 0 which will mask from the first dimension. Otherwise K + axis <= N. @@ -3451,18 +3433,18 @@ def batch_to_space_v2(input, block_shape, crops, name=None): # pylint: disable= This operation is equivalent to the following steps: 1. Reshape `input` to `reshaped` of shape: [block_shape[0], ..., block_shape[M-1], batch / prod(block_shape), input_shape[1], ..., - input_shape[N-1]] - 2. Permute dimensions of `reshaped` to produce `permuted` of shape - [batch / prod(block_shape), input_shape[1], block_shape[0], ..., + input_shape[N-1]] + 2. 
Permute dimensions of `reshaped` to produce `permuted` of shape + [batch / prod(block_shape), input_shape[1], block_shape[0], ..., input_shape[M], block_shape[M-1], input_shape[M+1], - ..., input_shape[N-1]] - 3. Reshape `permuted` to produce `reshaped_permuted` of shape - [batch / prod(block_shape), input_shape[1] * block_shape[0], ..., - input_shape[M] * block_shape[M-1], input_shape[M+1], ..., - input_shape[N-1]] - 4. Crop the start and end of dimensions `[1, ..., M]` of - `reshaped_permuted` according to `crops` to produce the output - of shape: + ..., input_shape[N-1]] + 3. Reshape `permuted` to produce `reshaped_permuted` of shape + [batch / prod(block_shape), input_shape[1] * block_shape[0], ..., + input_shape[M] * block_shape[M-1], input_shape[M+1], ..., + input_shape[N-1]] + 4. Crop the start and end of dimensions `[1, ..., M]` of + `reshaped_permuted` according to `crops` to produce the output + of shape: [batch / prod(block_shape), input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1], ..., input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1], input_shape[M+1], From 9c83a0e9a205a062d7c19a7fba175729c66ab13c Mon Sep 17 00:00:00 2001 From: nikochiko Date: Tue, 19 Nov 2019 12:31:26 +0530 Subject: [PATCH 007/442] Added new function process_save_format - Added new function `validate_save_format` as requested by @k-w-w inside `network.py`. - Using `validate_save_format` for validating save_format in `save.save_model` and `network.save_weights` Although, the a few updates will have to be made in `save_weights` because - `validate_save_format` is designed to work with path as well as h5py.File objects. This works with `save.save_model` but not with `network.save_weights` which accepts only String as the path. - Does it make sense to add functionality to save_weights to save it to a h5py.File object? --- tensorflow/python/keras/engine/network.py | 90 +++++++++++++++++------ tensorflow/python/keras/saving/save.py | 31 +------- 2 files changed, 70 insertions(+), 51 deletions(-) diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py index 8b8bbd902fd..bc83c7f3e7b 100644 --- a/tensorflow/python/keras/engine/network.py +++ b/tensorflow/python/keras/engine/network.py @@ -30,6 +30,7 @@ import numpy as np import six from six.moves import zip # pylint: disable=redefined-builtin +from tensorflow.python import tf2 from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import errors @@ -1067,28 +1068,7 @@ class Network(base_layer.Layer): ValueError: For invalid/unknown format arguments. """ self._assert_weights_created() - filepath_is_h5 = _is_hdf5_filepath(filepath) - if save_format is None: - if filepath_is_h5: - save_format = 'h5' - else: - save_format = 'tf' - else: - user_format = save_format.lower().strip() - if user_format in ('tensorflow', 'tf'): - save_format = 'tf' - elif user_format in ('hdf5', 'h5', 'keras'): - save_format = 'h5' - else: - raise ValueError( - 'Unknown format "%s". Was expecting one of {"tf", "h5"}.' % ( - save_format,)) - if save_format == 'tf' and filepath_is_h5: - raise ValueError( - ('save_weights got save_format="tf"/"tensorflow", but the ' - 'filepath ("%s") looks like an HDF5 file. 
Omit the ".h5"/".keras" ' - 'when saving in TensorFlow format.') - % filepath) + save_format = validate_save_format(filepath, save_format) if save_format == 'h5' and h5py is None: raise ImportError( @@ -2029,3 +2009,69 @@ def get_network_config(network, serialize_layer_fn=None): model_outputs = tf_utils.convert_inner_node_data(model_outputs) config['output_layers'] = model_outputs return config + + +def validate_save_format(filepath, save_format): + """Validates `save_format` argument passed to methods used for saving. + + Returns either 'tf' or 'h5', indicating whether to save the model + to Tensorflow SavedModel or HDF5. Output will default to 'tf' in TF2.X and + 'h5' in TF1.X. + + Defaults to 'h5' if `filepath` is a path to a hdf5 file (having suffix '.h5' or + '.hdf5' or '.keras') or is an h5py.File object. + + Args: + filepath: Value of the `filepath` argument passed to the method. + Can be: + - String + - h5py.File object + save_format: String, value of the 'save_format' argument as passed. + + Returns: + save_format: String, 'h5' or 'tf'. The processed + value of the `save_format` argument. + + Raises: + ValueError: If + - `filepath` is not a String or an h5py.File object. + - `save_format` is not valid. Valid values are "tensorflow", "tf" for + saving in SavedModel format, and "hdf5", "keras" or "h5" for saving in + h5 format. + - `save_format` is "tf" but `filepath` is a path to a h5 file. + - `save_format` is "tf" but `filepath` is an h5py.File object. + """ + if type(filepath) != str and not isinstance(filepath, h5py.File): + raise ValueError( + 'Expected `filepath` to be a String or h5py.File object. Got' + 'unsupported value %s of type %s' + % (filepath, type(filepath))) + + filepath_is_h5py_file = h5py is not None and isinstance(filepath, h5py.File) + filepath_is_h5 = type(filepath) == str and _is_hdf5_filepath(filepath) + if save_format is None: + if filepath_is_h5 or filepath_is_h5py_file: + save_format = 'h5' + else: + save_format = 'tf' if tf2.enabled() else 'h5' + else: + user_format = save_format.lower().strip() + if user_format in ('tensorflow', 'tf'): + save_format = 'tf' + elif user_format in ('hdf5', 'h5', 'keras'): + save_format = 'h5' + else: + raise ValueError( + 'Unknown format "%s". Was expecting one of {"tf", "h5"}.' % ( + save_format,)) + if save_format == 'tf' and filepath_is_h5: + raise ValueError( + ('Got save_format="tf"/"tensorflow", but the filepath ("%s") looks ' + 'like an HDF5 file. 
Omit the ".h5"/".keras" when saving in ' + 'TensorFlow format.') + % filepath) + if save_format == 'tf' and filepath_is_h5py_file: + raise ValueError( + 'Got save_format="tf"/"tensorflow", but the given `filepath`' + 'is an h5py.File object.') + return save_format diff --git a/tensorflow/python/keras/saving/save.py b/tensorflow/python/keras/saving/save.py index 9f7f5778afe..91ee00dbaec 100644 --- a/tensorflow/python/keras/saving/save.py +++ b/tensorflow/python/keras/saving/save.py @@ -23,7 +23,7 @@ import os import six from tensorflow.python import tf2 -from tensorflow.python.keras.engine.network import _is_hdf5_filepath +from tensorflow.python.keras.engine import network from tensorflow.python.keras.saving import hdf5_format from tensorflow.python.keras.saving.saved_model import load as saved_model_load from tensorflow.python.keras.saving.saved_model import save as saved_model_save @@ -96,34 +96,7 @@ def save_model(model, 'unsupported value %s of type %s' % (filepath, type(filepath))) - filepath_is_h5py_file = h5py is not None and isinstance(filepath, h5py.File) - filepath_is_h5 = type(filepath) == str and _is_hdf5_filepath(filepath) - if save_format is None: - if (filepath_is_h5 or - (filepath_is_h5py_file)): - save_format = 'h5' - else: - save_format = 'tf' if tf2.enabled() else 'h5' - else: - user_format = save_format.lower().strip() - if user_format in ('tensorflow', 'tf'): - save_format = 'tf' - elif user_format in ('hdf5', 'h5', 'keras'): - save_format = 'h5' - else: - raise ValueError( - 'Unknown format "%s". Was expecting one of {"tf", "h5"}.' % ( - save_format,)) - if save_format == 'tf' and filepath_is_h5: - raise ValueError( - ('`save` got save_format="tf"/"tensorflow", but the ' - 'filepath ("%s") looks like an HDF5 file. Omit the ".h5"/".keras" ' - 'when saving in TensorFlow format.') - % filepath) - if save_format == 'tf' and filepath_is_h5py_file: - raise ValueError( - '`save` got save_format="tf"/"tensorflow", but the given `filepath`' - 'is an `h5py.File` object.') + save_format = network.validate_save_format(filepath, save_format) if save_format == 'h5': # TODO(b/130258301): add utility method for detecting model type. 
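As a rough illustration (not part of the patch) of how the new helper resolves formats, based on the implementation above and the tests added later in this series:

    from tensorflow.python.keras.engine import network

    network.validate_save_format('model.h5', None)            # -> 'h5' (inferred from suffix)
    network.validate_save_format('model_dir', 'tensorflow')   # -> 'tf'
    network.validate_save_format('model_dir', None)           # -> 'tf' when TF2 is enabled, else 'h5'
    network.validate_save_format('model.h5', 'tf')            # raises ValueError (suffix conflicts with format)
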
From 68c034cde3b887943e30a644f618369745b04e56 Mon Sep 17 00:00:00 2001 From: frreiss Date: Wed, 20 Nov 2019 11:43:31 -0800 Subject: [PATCH 008/442] Update python API docs per review comments --- .../base_api/api_def_WindowDataset.pbtxt | 68 +++++++++++++------ tensorflow/python/data/ops/dataset_ops.py | 37 +++++++--- 2 files changed, 77 insertions(+), 28 deletions(-) diff --git a/tensorflow/core/api_def/base_api/api_def_WindowDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_WindowDataset.pbtxt index 2e56f32cb2b..d3f00dff113 100644 --- a/tensorflow/core/api_def/base_api/api_def_WindowDataset.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_WindowDataset.pbtxt @@ -4,54 +4,82 @@ op { in_arg { name: "size" description: < Date: Wed, 4 Dec 2019 15:22:43 -0800 Subject: [PATCH 009/442] Address review comments --- .../base_api/api_def_WindowDataset.pbtxt | 18 ++++++------------ tensorflow/python/data/ops/dataset_ops.py | 8 ++------ 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/tensorflow/core/api_def/base_api/api_def_WindowDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_WindowDataset.pbtxt index d3f00dff113..2270f25967d 100644 --- a/tensorflow/core/api_def/base_api/api_def_WindowDataset.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_WindowDataset.pbtxt @@ -4,14 +4,14 @@ op { in_arg { name: "size" description: < Date: Wed, 16 Oct 2019 00:55:40 -0700 Subject: [PATCH 010/442] Allow an option to set CA file and CA Path to AWS SDK --- tensorflow/core/platform/s3/s3_file_system.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc index 936339079cf..ca6adfe37eb 100644 --- a/tensorflow/core/platform/s3/s3_file_system.cc +++ b/tensorflow/core/platform/s3/s3_file_system.cc @@ -124,6 +124,14 @@ Aws::Client::ClientConfiguration& GetDefaultClientConfig() { cfg.requestTimeoutMs = timeout; } } + const char* ca_file = getenv("S3_CA_FILE"); + if (ca_file) { + cfg.caFile = Aws::String(ca_file); + } + const char* ca_path = getenv("S3_CA_PATH"); + if (ca_path) { + cfg.caPath = Aws::String(ca_path); + } init = true; } From 0d31c0bee8a1e06c7b4fa977ce2bc6ce347aa96f Mon Sep 17 00:00:00 2001 From: Hans Gaiser Date: Thu, 5 Dec 2019 14:49:27 +0100 Subject: [PATCH 011/442] Use _get_distribution_strategy only when it is available. --- tensorflow/python/keras/callbacks.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py index bc2f0461fbc..b8d7761b608 100644 --- a/tensorflow/python/keras/callbacks.py +++ b/tensorflow/python/keras/callbacks.py @@ -1526,10 +1526,14 @@ class TensorBoard(Callback): """Sets Keras model and writes graph if specified.""" self.model = model - # TensorBoard callback involves writing a summary file in a - # possibly distributed settings. - self._log_write_dir = distributed_file_utils.write_dirpath( - self.log_dir, self.model._get_distribution_strategy()) # pylint: disable=protected-access + # In case this callback is used via native Keras, _get_distribution_strategy does not exist. + if hasattr(self.model, '_get_distribution_strategy'): + # TensorBoard callback involves writing a summary file in a + # possibly distributed settings. 
+ self._log_write_dir = distributed_file_utils.write_dirpath( + self.log_dir, self.model._get_distribution_strategy()) # pylint: disable=protected-access + else: + self._log_write_dir = self.log_dir with context.eager_mode(): self._close_writers() @@ -1725,9 +1729,11 @@ class TensorBoard(Callback): summary_state.writer = self._prev_summary_writer summary_state.step = self._prev_summary_step - # Safely remove the unneeded temp files. - distributed_file_utils.remove_temp_dirpath( - self.log_dir, self.model._get_distribution_strategy()) # pylint: disable=protected-access + # In case this callback is used via native Keras, _get_distribution_strategy does not exist. + if hasattr(self.model, '_get_distribution_strategy'): + # Safely remove the unneeded temp files. + distributed_file_utils.remove_temp_dirpath( + self.log_dir, self.model._get_distribution_strategy()) # pylint: disable=protected-access def _enable_trace(self): if context.executing_eagerly(): From 0f7b5e410f414464ec3e08ab1995c75d378af6cc Mon Sep 17 00:00:00 2001 From: Kaustubh Maske Patil <37668193+nikochiko@users.noreply.github.com> Date: Sat, 21 Dec 2019 20:39:51 +0530 Subject: [PATCH 012/442] Update save.py --- tensorflow/python/keras/saving/save.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/keras/saving/save.py b/tensorflow/python/keras/saving/save.py index 3b2fa34df01..fb1ba7f05da 100644 --- a/tensorflow/python/keras/saving/save.py +++ b/tensorflow/python/keras/saving/save.py @@ -18,12 +18,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import os import sys import six -from tensorflow.python import tf2 from tensorflow.python.keras.engine import network from tensorflow.python.keras.saving import hdf5_format from tensorflow.python.keras.saving.saved_model import load as saved_model_load @@ -99,9 +97,9 @@ def save_model(model, if type(filepath) != str and not isinstance(filepath, h5py.File): raise ValueError( - 'Expected `filepath` to be a String or `h5py.File` object. Got' - 'unsupported value %s of type %s' - % (filepath, type(filepath))) + 'Expected `filepath` to be a String or `h5py.File` object. Got' + 'unsupported value %s of type %s' + % (filepath, type(filepath))) save_format = network.validate_save_format(filepath, save_format) if save_format == 'h5': From b641f6953f72c8c298614ea521981f4dc86ab446 Mon Sep 17 00:00:00 2001 From: Kaustubh Maske Patil <37668193+nikochiko@users.noreply.github.com> Date: Sat, 21 Dec 2019 20:43:02 +0530 Subject: [PATCH 013/442] Update network.py --- tensorflow/python/keras/engine/network.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py index bc83c7f3e7b..9b516dc2fc7 100644 --- a/tensorflow/python/keras/engine/network.py +++ b/tensorflow/python/keras/engine/network.py @@ -2042,10 +2042,10 @@ def validate_save_format(filepath, save_format): - `save_format` is "tf" but `filepath` is an h5py.File object. """ if type(filepath) != str and not isinstance(filepath, h5py.File): - raise ValueError( - 'Expected `filepath` to be a String or h5py.File object. Got' - 'unsupported value %s of type %s' - % (filepath, type(filepath))) + raise ValueError( + 'Expected `filepath` to be a String or h5py.File object. 
Got' + 'unsupported value %s of type %s' + % (filepath, type(filepath))) filepath_is_h5py_file = h5py is not None and isinstance(filepath, h5py.File) filepath_is_h5 = type(filepath) == str and _is_hdf5_filepath(filepath) From 25ec563a11639c583ac38ef626d598f9ee87208b Mon Sep 17 00:00:00 2001 From: nikochiko Date: Sat, 4 Jan 2020 13:11:01 +0530 Subject: [PATCH 014/442] Fix sanity --- tensorflow/python/keras/engine/network.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py index 9b516dc2fc7..0cfc96c7840 100644 --- a/tensorflow/python/keras/engine/network.py +++ b/tensorflow/python/keras/engine/network.py @@ -2051,9 +2051,9 @@ def validate_save_format(filepath, save_format): filepath_is_h5 = type(filepath) == str and _is_hdf5_filepath(filepath) if save_format is None: if filepath_is_h5 or filepath_is_h5py_file: - save_format = 'h5' - else: - save_format = 'tf' if tf2.enabled() else 'h5' + save_format = 'h5' + else: + save_format = 'tf' if tf2.enabled() else 'h5' else: user_format = save_format.lower().strip() if user_format in ('tensorflow', 'tf'): @@ -2063,7 +2063,7 @@ def validate_save_format(filepath, save_format): else: raise ValueError( 'Unknown format "%s". Was expecting one of {"tf", "h5"}.' % ( - save_format,)) + save_format)) if save_format == 'tf' and filepath_is_h5: raise ValueError( ('Got save_format="tf"/"tensorflow", but the filepath ("%s") looks ' From 71dd20a99530f22c86a987088484db8f4f227e52 Mon Sep 17 00:00:00 2001 From: Lamar Date: Thu, 9 Jan 2020 20:20:12 +0100 Subject: [PATCH 015/442] fixed static sized arrays with variable length using const int or int for the size of an array implies that it has variable length (ill-formed, https://en.cppreference.com/w/cpp/language/ub), static arrays' lengths should be constexpr or a macro constant --- tensorflow/lite/micro/micro_utils_test.cc | 6 +++--- tensorflow/lite/micro/testing_helpers_test.cc | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/lite/micro/micro_utils_test.cc b/tensorflow/lite/micro/micro_utils_test.cc index e33d53b1c48..7aa31130595 100644 --- a/tensorflow/lite/micro/micro_utils_test.cc +++ b/tensorflow/lite/micro/micro_utils_test.cc @@ -82,7 +82,7 @@ TF_LITE_MICRO_TEST(FloatToAsymmetricQuantizedInt32Test) { TF_LITE_MICRO_TEST(AsymmetricQuantizeInt8) { float values[] = {-10.3, -3.1, -2.1, -1.9, -0.9, 0.1, 0.9, 1.85, 2.9, 4.1}; int8_t goldens[] = {-20, -5, -3, -3, -1, 1, 3, 5, 7, 9}; - const int length = sizeof(values) / sizeof(float); + constexpr int length = sizeof(values) / sizeof(float); int8_t quantized[length]; tflite::AsymmetricQuantize(values, quantized, length, 0.5, 1); for (int i = 0; i < length; i++) { @@ -93,7 +93,7 @@ TF_LITE_MICRO_TEST(AsymmetricQuantizeInt8) { TF_LITE_MICRO_TEST(AsymmetricQuantizeUInt8) { float values[] = {-10.3, -3.1, -2.1, -1.9, -0.9, 0.1, 0.9, 1.85, 2.9, 4.1}; uint8_t goldens[] = {106, 121, 123, 123, 125, 127, 129, 131, 133, 135}; - const int length = sizeof(values) / sizeof(float); + constexpr int length = sizeof(values) / sizeof(float); uint8_t quantized[length]; tflite::AsymmetricQuantize(values, quantized, length, 0.5, 127); for (int i = 0; i < length; i++) { @@ -104,7 +104,7 @@ TF_LITE_MICRO_TEST(AsymmetricQuantizeUInt8) { TF_LITE_MICRO_TEST(SymmetricQuantizeInt32) { float values[] = {-10.3, -3.1, -2.1, -1.9, -0.9, 0.1, 0.9, 1.85, 2.9, 4.1}; int32_t goldens[] = {-21, -6, -4, -4, -2, 0, 2, 4, 6, 8}; - const int length = sizeof(values) / 
sizeof(float); + constexpr int length = sizeof(values) / sizeof(float); int32_t quantized[length]; tflite::SymmetricQuantize(values, quantized, length, 0.5); for (int i = 0; i < length; i++) { diff --git a/tensorflow/lite/micro/testing_helpers_test.cc b/tensorflow/lite/micro/testing_helpers_test.cc index a7fc2996eb9..478f5ae6336 100644 --- a/tensorflow/lite/micro/testing_helpers_test.cc +++ b/tensorflow/lite/micro/testing_helpers_test.cc @@ -21,7 +21,7 @@ TF_LITE_MICRO_TESTS_BEGIN TF_LITE_MICRO_TEST(CreateQuantizedBiasTensor) { float input_scale = 0.5; float weight_scale = 0.5; - const int tensor_size = 12; + constexpr int tensor_size = 12; int dims_arr[] = {4, 2, 3, 2, 1}; const char* tensor_name = "test_tensor"; int32_t quantized[tensor_size]; @@ -45,7 +45,7 @@ TF_LITE_MICRO_TEST(CreateQuantizedBiasTensor) { TF_LITE_MICRO_TEST(CreatePerChannelQuantizedBiasTensor) { float input_scale = 0.5; float weight_scales[] = {0.5, 1, 2, 4}; - const int tensor_size = 12; + constexpr int tensor_size = 12; const int channels = 4; int dims_arr[] = {4, 4, 3, 1, 1}; const char* tensor_name = "test_tensor"; @@ -78,7 +78,7 @@ TF_LITE_MICRO_TEST(CreatePerChannelQuantizedBiasTensor) { TF_LITE_MICRO_TEST(CreateSymmetricPerChannelQuantizedTensor) { const int tensor_size = 12; - const int channels = 2; + constexpr int channels = 2; const int dims_arr[] = {4, channels, 3, 2, 1}; const char* tensor_name = "test_tensor"; int8_t quantized[12]; From b2875d86f0f30fed4b3b947d01471d37503bcb16 Mon Sep 17 00:00:00 2001 From: nikochiko Date: Sat, 11 Jan 2020 10:18:48 +0530 Subject: [PATCH 016/442] Add tests --- .../python/keras/engine/network_test.py | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tensorflow/python/keras/engine/network_test.py b/tensorflow/python/keras/engine/network_test.py index ff47e46dbac..2576454f4a3 100644 --- a/tensorflow/python/keras/engine/network_test.py +++ b/tensorflow/python/keras/engine/network_test.py @@ -21,6 +21,7 @@ from __future__ import print_function import numpy as np from tensorflow.python import keras +from tensorflow.python import tf2 from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -1879,6 +1880,49 @@ class CacheCorrectnessTest(keras_parameterized.TestCase): self.assertEqual(network.stateful, False) +class SaveFormatValidationTest(keras_parameterized.TestCase): + + def test_save_format_validation(self): + filepath = 'file/path' + h5_filepath = 'h5_filepath.h5' + h5_filepath_2 = 'h5_filepath.hdf5' + h5_filepath_3 = 'h5_filepath.keras' + + tf2.disable() + self.assertEqual(network_lib.validate_save_format(filepath, None), 'h5') + + tf2.enable() + self.assertEqual(network_lib.validate_save_format(filepath, None), 'tf') + + self.assertEqual(network_lib.validate_save_format(filepath, 'h5'), 'h5') + self.assertEqual(network_lib.validate_save_format(h5_filepath, None), 'h5') + self.assertEqual( + network_lib.validate_save_format(h5_filepath_2, None), 'h5') + self.assertEqual( + network_lib.validate_save_format(h5_filepath_3, None), 'h5') + self.assertEqual( + network_lib.validate_save_format(h5_filepath, 'hdf5'), 'h5') + self.assertEqual( + network_lib.validate_save_format(h5_filepath, 'keras'), 'h5') + + self.assertEqual(network_lib.validate_save_format(filepath, 'tf'), 'tf') + self.assertEqual( + network_lib.validate_save_format(filepath, 'tensorflow'), 'tf') + + with self.assertRaisesRegex(ValueError, 'Expected `filepath` to be a String\ + or h5py.File object. 
Got unsupported value 42 of type int'): + network_lib.validate_save_format(42, 'h5') + + with self.assertRaisesRegex(ValueError, 'Unknown format "%s". Was expecting\ + one of {"tf", "h5"}.'): + network_lib.validate_save_format(filepath, 'unknown_format') + + with self.assertRaisesRegex(ValueError, 'Got save_format="tf"/"tensorflow",\ + but the filepath ("%s") looks like an HDF5 file. Omit the ".h5"/".keras"\ + when saving in TensorFlow format.'.format(h5_filepath)): + network_lib.validate_save_format(h5_filepath, 'tf') + + if __name__ == '__main__': test.main() From 616154eb62ad1ab2f89c5906253edab2bc141e2d Mon Sep 17 00:00:00 2001 From: nikochiko Date: Sat, 11 Jan 2020 10:19:33 +0530 Subject: [PATCH 017/442] Fix typo --- tensorflow/python/keras/engine/network_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/keras/engine/network_test.py b/tensorflow/python/keras/engine/network_test.py index 2576454f4a3..e74c42982cb 100644 --- a/tensorflow/python/keras/engine/network_test.py +++ b/tensorflow/python/keras/engine/network_test.py @@ -1919,7 +1919,7 @@ class SaveFormatValidationTest(keras_parameterized.TestCase): with self.assertRaisesRegex(ValueError, 'Got save_format="tf"/"tensorflow",\ but the filepath ("%s") looks like an HDF5 file. Omit the ".h5"/".keras"\ - when saving in TensorFlow format.'.format(h5_filepath)): + when saving in TensorFlow format.' % h5_filepath): network_lib.validate_save_format(h5_filepath, 'tf') From f7678aa47f52d5955ff9ef65f9b527414675100c Mon Sep 17 00:00:00 2001 From: nikochiko Date: Sat, 11 Jan 2020 10:20:13 +0530 Subject: [PATCH 018/442] Fix typo --- tensorflow/python/keras/engine/network_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/keras/engine/network_test.py b/tensorflow/python/keras/engine/network_test.py index e74c42982cb..b707bb8e89e 100644 --- a/tensorflow/python/keras/engine/network_test.py +++ b/tensorflow/python/keras/engine/network_test.py @@ -1914,7 +1914,7 @@ class SaveFormatValidationTest(keras_parameterized.TestCase): network_lib.validate_save_format(42, 'h5') with self.assertRaisesRegex(ValueError, 'Unknown format "%s". Was expecting\ - one of {"tf", "h5"}.'): + one of {"tf", "h5"}.' % 'unknown_format'): network_lib.validate_save_format(filepath, 'unknown_format') with self.assertRaisesRegex(ValueError, 'Got save_format="tf"/"tensorflow",\ From 8faed4f3d54afef9366f11b83dae505951768173 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= Date: Thu, 9 Jan 2020 13:13:43 +0100 Subject: [PATCH 019/442] TFLu: Add stm32f4 and build target Add new TARGET=stm32f4 that is working with Renode. Add new target that will just build the test binaries. Add new CI script for this as well. The purpose of this is CMSIS-NN regression. 
--- tensorflow/lite/micro/stm32f4/debug_log.cc | 25 +++++ .../lite/micro/tools/ci_build/test_all.sh | 3 + .../lite/micro/tools/ci_build/test_stm32f4.sh | 40 +++++++ tensorflow/lite/micro/tools/make/Makefile | 7 +- .../micro/tools/make/helper_functions.inc | 1 + .../tools/make/targets/stm32f4/stm32f4.lds | 102 ++++++++++++++++++ .../tools/make/targets/stm32f4_makefile.inc | 86 +++++++++++++++ 7 files changed, 263 insertions(+), 1 deletion(-) create mode 100644 tensorflow/lite/micro/stm32f4/debug_log.cc create mode 100755 tensorflow/lite/micro/tools/ci_build/test_stm32f4.sh create mode 100644 tensorflow/lite/micro/tools/make/targets/stm32f4/stm32f4.lds create mode 100644 tensorflow/lite/micro/tools/make/targets/stm32f4_makefile.inc diff --git a/tensorflow/lite/micro/stm32f4/debug_log.cc b/tensorflow/lite/micro/stm32f4/debug_log.cc new file mode 100644 index 00000000000..311005fd1ca --- /dev/null +++ b/tensorflow/lite/micro/stm32f4/debug_log.cc @@ -0,0 +1,25 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/debug_log.h" + +extern "C" void DebugLog(const char* s) { + asm("mov r0, #0x04\n" // SYS_WRITE0 + "mov r1, %[str]\n" + "bkpt #0xAB\n" + : + : [ str ] "r"(s) + : "r0", "r1"); +} diff --git a/tensorflow/lite/micro/tools/ci_build/test_all.sh b/tensorflow/lite/micro/tools/ci_build/test_all.sh index 28358610e96..873cb8b2506 100755 --- a/tensorflow/lite/micro/tools/ci_build/test_all.sh +++ b/tensorflow/lite/micro/tools/ci_build/test_all.sh @@ -49,4 +49,7 @@ tensorflow/lite/micro/tools/ci_build/test_sparkfun.sh echo "Running x86 tests at `date`" tensorflow/lite/micro/tools/ci_build/test_x86.sh +echo "Running stm32f4 tests at `date`" +tensorflow/lite/micro/tools/ci_build/test_stm32f4.sh + echo "Finished all micro tests at `date`" diff --git a/tensorflow/lite/micro/tools/ci_build/test_stm32f4.sh b/tensorflow/lite/micro/tools/ci_build/test_stm32f4.sh new file mode 100755 index 00000000000..14e229c092f --- /dev/null +++ b/tensorflow/lite/micro/tools/ci_build/test_stm32f4.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +# +# Tests the microcontroller code for stm32f4 + +set -e + +TARGET=stm32f4 +TAGS=cmsis-nn +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT_DIR=${SCRIPT_DIR}/../../../../.. +cd ${ROOT_DIR} +pwd + +source tensorflow/lite/micro/tools/ci_build/helper_functions.sh + +readable_run make -f tensorflow/lite/micro/tools/make/Makefile clean + +# TODO(b/143715361): downloading first to allow for parallel builds. +readable_run make -f tensorflow/lite/micro/tools/make/Makefile TAGS=${TAGS} TARGET=${TARGET} third_party_downloads + +# Build test binaries first +readable_run make -j8 -f tensorflow/lite/micro/tools/make/Makefile TAGS=${TAGS} TARGET=${TARGET} build + +# Parallell builds doesn't work very well with this +readable_run make -f tensorflow/lite/micro/tools/make/Makefile TAGS=${TAGS} TARGET=${TARGET} test + diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile index 224ee879cb5..7fb32175622 100644 --- a/tensorflow/lite/micro/tools/make/Makefile +++ b/tensorflow/lite/micro/tools/make/Makefile @@ -81,6 +81,8 @@ CC_PREFIX := # runtime that can be linked in to other programs. MICROLITE_LIB_NAME := libtensorflow-microlite.a +MICRO_LITE_EXAMPLE_TESTS := $(wildcard tensorflow/lite/micro/examples/*/Makefile.inc) + MICROLITE_TEST_SRCS := \ $(wildcard tensorflow/lite/micro/*test.cc) \ $(wildcard tensorflow/lite/micro/kernels/*test.cc) \ @@ -240,7 +242,7 @@ CC := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}${CC_TOOL} AR := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}${AR_TOOL} # Load the examples. -include $(wildcard tensorflow/lite/micro/examples/*/Makefile.inc) +include $(MICRO_LITE_EXAMPLE_TESTS) # Create rules for downloading third-party dependencies. THIRD_PARTY_TARGETS := @@ -308,6 +310,9 @@ $(eval $(call microlite_test,kernel_$(notdir $(basename $(TEST_TARGET))),$(TEST_ test: $(MICROLITE_TEST_TARGETS) +# Just build the test targets +build: $(MICROLITE_BUILD_TARGETS) + generate_projects: $(ALL_PROJECT_TARGETS) generate_non_kernel_projects: $(filter-out generate_kernel%,$(ALL_PROJECT_TARGETS)) diff --git a/tensorflow/lite/micro/tools/make/helper_functions.inc b/tensorflow/lite/micro/tools/make/helper_functions.inc index 5a162675f85..ca357c55f5c 100644 --- a/tensorflow/lite/micro/tools/make/helper_functions.inc +++ b/tensorflow/lite/micro/tools/make/helper_functions.inc @@ -371,6 +371,7 @@ test_$(1): $$($(1)_BINARY) $$(TEST_SCRIPT) $$($(1)_BINARY) '~~~ALL TESTS PASSED~~~' ifneq (,$(findstring _test,$(1))) MICROLITE_TEST_TARGETS += test_$(1) + MICROLITE_BUILD_TARGETS += $$($(1)_BINARY) endif $(eval $(call generate_microlite_projects,$(1),$(call specialize,$(2)),$(3))) endef diff --git a/tensorflow/lite/micro/tools/make/targets/stm32f4/stm32f4.lds b/tensorflow/lite/micro/tools/make/targets/stm32f4/stm32f4.lds new file mode 100644 index 00000000000..6ecde0000b2 --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/stm32f4/stm32f4.lds @@ -0,0 +1,102 @@ +/* Copyright 2020 Google Inc. All Rights Reserved. + +Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/* Copied and modified from: tensorflow/lite/micro/tools/make/targets/bluepill/bluepill.lds
+
+*/
+
+/*
+ * 0x00000000 - 0x07ffffff - aliased to flash or sys memory depending on BOOT jumpers.
+ * 0x08000000 - 0x0801ffff - Flash.
+ * 0x1ffff000 - 0x1ffff7ff - Boot firmware in system memory.
+ * 0x1ffff800 - 0x1fffffff - Option bytes.
+ * 0x20000000 - 0x20004fff - SRAM.
+ * 0x40000000 - 0x40023400 - Peripherals
+ */
+
+/* Define main entry point */
+ENTRY(_main)
+
+/* 20K of RAM and 256K of FLASH */
+MEMORY {
+RAM (xrw) : ORIGIN = 0x20000000, LENGTH = 20K
+FLASH (rx) : ORIGIN = 0x8000000, LENGTH = 256K
+}
+
+/* Compute where the stack ends rather than hard coding it */
+_ld_stack_end_addr = ORIGIN(RAM) + LENGTH(RAM);
+_ld_min_stack_size = 0x200;
+
+SECTIONS {
+
+/* interrupt vector goes to top of flash */
+
+.interrupt_vector : {
+  . = ALIGN(4);
+  KEEP(*(.interrupt_vector))
+  . = ALIGN(4);
+} >FLASH
+
+/* read only .text and .rodata go to flash */
+
+.text : {
+  . = ALIGN(4);
+  KEEP(*(.text.interrupt_handler))
+  *(.text*)
+} >FLASH
+
+.rodata : {
+  . = ALIGN(4);
+  *(.rodata*)
+  . = ALIGN(4);
+} >FLASH
+
+/* read-write data needs to be stored in flash but copied to ram */
+.data : {
+  . = ALIGN(4);
+  _ld_data_load_dest_start = .; /* export start of the load destination */
+  *(.data*)
+  . = ALIGN(4);
+  _ld_data_load_dest_stop = .; /* export end of the load destination */
+} >RAM AT> FLASH
+_ld_data_load_source = LOADADDR(.data);
+
+/* uninitialized data section needs zero initialization */
+.bss :
+{
+  . = ALIGN(4);
+  _ld_bss_data_start = .;
+  *(.bss*)
+  . = ALIGN(4);
+  _ld_bss_data_stop = .;
+} >RAM
+
+._user_heap_stack :
+{
+  . = ALIGN(8);
+  . += _ld_min_stack_size;
+  PROVIDE(end = .);
+  . 
= ALIGN(8); +} >RAM + +/DISCARD/ : +{ + libc.a (*) + libm.a (*) + libgcc.a (*) +} + +} /* SECTIONS */ diff --git a/tensorflow/lite/micro/tools/make/targets/stm32f4_makefile.inc b/tensorflow/lite/micro/tools/make/targets/stm32f4_makefile.inc new file mode 100644 index 00000000000..b99e11e0328 --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/stm32f4_makefile.inc @@ -0,0 +1,86 @@ +# Settings for stm32f4 based platforms +ifeq ($(TARGET), stm32f4) + export PATH := $(MAKEFILE_DIR)/downloads/gcc_embedded/bin/:$(PATH) + TARGET_ARCH := cortex-m4 + TARGET_TOOLCHAIN_PREFIX := arm-none-eabi- + + $(eval $(call add_third_party_download,$(GCC_EMBEDDED_URL),$(GCC_EMBEDDED_MD5),gcc_embedded,)) + $(eval $(call add_third_party_download,$(CMSIS_URL),$(CMSIS_MD5),cmsis,)) + $(eval $(call add_third_party_download,$(STM32_BARE_LIB_URL),$(STM32_BARE_LIB_MD5),stm32_bare_lib,)) + + PLATFORM_FLAGS = \ + -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \ + -DTF_LITE_STATIC_MEMORY \ + -DTF_LITE_MCU_DEBUG_LOG \ + -fno-rtti \ + -fmessage-length=0 \ + -fno-exceptions \ + -fno-unwind-tables \ + -fno-builtin \ + -ffunction-sections \ + -fdata-sections \ + -funsigned-char \ + -MMD \ + -mcpu=cortex-m4 \ + -mthumb \ + -std=gnu++11 \ + -Wvla \ + -Wall \ + -Wextra \ + -Wno-unused-parameter \ + -Wno-missing-field-initializers \ + -Wno-write-strings \ + -Wno-sign-compare \ + -fno-delete-null-pointer-checks \ + -fomit-frame-pointer \ + -fpermissive \ + -g \ + -Os + CXXFLAGS += $(PLATFORM_FLAGS) + CCFLAGS += $(PLATFORM_FLAGS) + LDFLAGS += \ + --specs=nosys.specs \ + -T $(MAKEFILE_DIR)/targets/stm32f4/stm32f4.lds \ + -Wl,-Map=$(MAKEFILE_DIR)/gen/$(TARGET).map,--cref \ + -Wl,--gc-sections + BUILD_TYPE := micro + MICROLITE_LIBS := \ + -lm + INCLUDES += \ + -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/Core/Include/ \ + -I$(MAKEFILE_DIR)/downloads/stm32_bare_lib/include + MICROLITE_CC_SRCS += \ + $(wildcard $(MAKEFILE_DIR)/downloads/stm32_bare_lib/source/*.c) \ + $(wildcard $(MAKEFILE_DIR)/downloads/stm32_bare_lib/source/*.cc) + EXCLUDED_SRCS := \ + $(MAKEFILE_DIR)/downloads/stm32_bare_lib/source/debug_log.c + MICROLITE_CC_SRCS := $(filter-out $(EXCLUDED_SRCS), $(MICROLITE_CC_SRCS)) + # Stm32f4 is reusing the bluepill renode scripts for now + TEST_SCRIPT := tensorflow/lite/micro/testing/test_bluepill_binary.sh + # TODO, non working tests.. the micro_speech example and conv_test.cc/depthwise_conv_test.cc partly works + EXCLUDED_TESTS := \ + tensorflow/lite/micro/micro_interpreter_test.cc \ + tensorflow/lite/micro/micro_allocator_test.cc \ + tensorflow/lite/micro/memory_helpers_test.cc \ + tensorflow/lite/micro/kernels/depthwise_conv_test.cc \ + tensorflow/lite/micro/kernels/conv_test.cc \ + tensorflow/lite/micro/simple_tensor_allocator_test.cc + MICROLITE_TEST_SRCS := $(filter-out $(EXCLUDED_TESTS), $(MICROLITE_TEST_SRCS)) + EXCLUDED_EXAMPLE_TESTS := \ + tensorflow/lite/micro/examples/magic_wand/Makefile.inc \ + tensorflow/lite/micro/examples/person_detection/Makefile.inc \ + tensorflow/lite/micro/examples/person_detection_experimental/Makefile.inc \ + tensorflow/lite/micro/examples/mobilenet_v2/Makefile.inc \ + tensorflow/lite/micro/examples/micro_speech/Makefile.inc \ + tensorflow/lite/micro/examples/ds_cnn_l/Makefile.inc + MICRO_LITE_EXAMPLE_TESTS := $(filter-out $(EXCLUDED_EXAMPLE_TESTS), $(MICRO_LITE_EXAMPLE_TESTS)) + +# These are microcontroller-specific rules for converting the ELF output +# of the linker into a binary image that can be loaded directly. 
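+# (Passing "-O binary" below makes objcopy strip the ELF container and emit
+# the raw image laid out by stm32f4.lds; that .bin is what would be flashed
+# to real hardware, while the renode-based test scripts load the ELF directly.)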
+OBJCOPY := $(TARGET_TOOLCHAIN_PREFIX)objcopy + +$(BINDIR)/%.bin: $(BINDIR)/% + @mkdir -p $(dir $@) + $(OBJCOPY) $< $@ -O binary + +endif From ed752449a943d60875e059ae0d6d05766f175c1f Mon Sep 17 00:00:00 2001 From: nikochiko Date: Thu, 16 Jan 2020 21:22:38 +0530 Subject: [PATCH 020/442] Fix spacing --- tensorflow/python/keras/engine/network.py | 2 +- tensorflow/python/keras/engine/network_test.py | 10 +++------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py index 0cfc96c7840..9fbc4363209 100644 --- a/tensorflow/python/keras/engine/network.py +++ b/tensorflow/python/keras/engine/network.py @@ -2043,7 +2043,7 @@ def validate_save_format(filepath, save_format): """ if type(filepath) != str and not isinstance(filepath, h5py.File): raise ValueError( - 'Expected `filepath` to be a String or h5py.File object. Got' + 'Expected `filepath` to be a String or h5py.File object. Got ' 'unsupported value %s of type %s' % (filepath, type(filepath))) diff --git a/tensorflow/python/keras/engine/network_test.py b/tensorflow/python/keras/engine/network_test.py index b707bb8e89e..dd902128909 100644 --- a/tensorflow/python/keras/engine/network_test.py +++ b/tensorflow/python/keras/engine/network_test.py @@ -1909,17 +1909,13 @@ class SaveFormatValidationTest(keras_parameterized.TestCase): self.assertEqual( network_lib.validate_save_format(filepath, 'tensorflow'), 'tf') - with self.assertRaisesRegex(ValueError, 'Expected `filepath` to be a String\ - or h5py.File object. Got unsupported value 42 of type int'): + with self.assertRaises(ValueError): network_lib.validate_save_format(42, 'h5') - with self.assertRaisesRegex(ValueError, 'Unknown format "%s". Was expecting\ - one of {"tf", "h5"}.' % 'unknown_format'): + with self.assertRaises(ValueError): network_lib.validate_save_format(filepath, 'unknown_format') - with self.assertRaisesRegex(ValueError, 'Got save_format="tf"/"tensorflow",\ - but the filepath ("%s") looks like an HDF5 file. Omit the ".h5"/".keras"\ - when saving in TensorFlow format.' % h5_filepath): + with self.assertRaises(ValueError): network_lib.validate_save_format(h5_filepath, 'tf') From a22df00354a51030294902b9f047c2f71c088851 Mon Sep 17 00:00:00 2001 From: TengLu Date: Fri, 17 Jan 2020 17:09:53 +0800 Subject: [PATCH 021/442] Add weight cache for FP32 MatMul. --- tensorflow/core/graph/mkl_layout_pass.cc | 17 +- tensorflow/core/kernels/mkl_fused_ops_test.cc | 294 +++++++++++------- .../core/kernels/mkl_matmul_op_fused.cc | 63 ++-- .../core/kernels/mkl_matmul_ops_common.h | 83 ++++- tensorflow/core/kernels/mkl_qmatmul_op.cc | 101 +----- tensorflow/core/ops/mkl_nn_ops.cc | 1 + 6 files changed, 338 insertions(+), 221 deletions(-) diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index 551193262e2..fae5af1961e 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -483,7 +483,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { CopyAttrsFusedConv2D, FusedConv2DRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.fused_matmul, csinfo_.mkl_fused_matmul, - CopyAttrsAll, FusedMatMulRewrite}); + CopyAttrsAllCheckConstFilter, FusedMatMulRewrite}); #ifndef ENABLE_MKLDNN_V1 rinfo_.push_back({csinfo_.identity, @@ -1877,6 +1877,9 @@ rinfo_.push_back({csinfo_.tanh_grad, // NOTE: names are alphabetically sorted. 
static void CopyAttrsAll(const Node* orig_node, NodeBuilder* nb, bool change_format = false); + static void CopyAttrsAllCheckConstFilter(const Node* orig_node, + NodeBuilder* nb, + bool change_format = false); static void CopyAttrsConv(const Node* orig_node, NodeBuilder* nb, bool change_format = false); @@ -2468,6 +2471,18 @@ void MklLayoutRewritePass::CopyAttrsAll(const Node* orig_node, NodeBuilder* nb, } } +// Generic function to copy all attributes and check if filter is const. +void MklLayoutRewritePass::CopyAttrsAllCheckConstFilter(const Node* orig_node, + NodeBuilder* nb, + bool change_format) { + CopyAttrsAll(orig_node, nb, change_format); + + // Check and set filter attribute. + Node* filter_node = nullptr; + TF_CHECK_OK(orig_node->input_node(1, &filter_node)); + nb->Attr("is_filter_const", filter_node->IsConstant()); +} + void MklLayoutRewritePass::CopyAttrsConvCheckConstFilter(const Node* orig_node, NodeBuilder* nb, bool change_format) { diff --git a/tensorflow/core/kernels/mkl_fused_ops_test.cc b/tensorflow/core/kernels/mkl_fused_ops_test.cc index 90595c47b93..410f701c824 100644 --- a/tensorflow/core/kernels/mkl_fused_ops_test.cc +++ b/tensorflow/core/kernels/mkl_fused_ops_test.cc @@ -301,25 +301,24 @@ class MklFusedConv2DOpTest : public OpsTestBase { int depth = kDepth, int image_width = kImageWidth, int image_height = kImageHeight, int image_batch_count = kImageBatchCount) { - const FusedGraphRunner run_default = - [this](const Tensor& input_data, const Tensor& filter_data, - const Tensor& bias_data, const std::vector& fused_ops, - Tensor* out) { - RunConv2DUnfused(input_data, filter_data, bias_data, fused_ops, out); - }; + const FusedGraphRunner run_default = [this]( + const Tensor& input_data, const Tensor& filter_data, + const Tensor& bias_data, const std::vector& fused_ops, + Tensor* out) { + RunConv2DUnfused(input_data, filter_data, bias_data, fused_ops, out); + }; - const FusedGraphRunner run_fused = - [this](const Tensor& input_data, const Tensor& filter_data, - const Tensor& bias_data, const std::vector& fused_ops, - Tensor* out) { - std::vector fused_input = {bias_data}; - if (std::find(fused_ops.begin(), fused_ops.end(), "Add") != - fused_ops.end()) { - fused_input.push_back(input_data); - } - RunMklFusedConv2DOp(input_data, filter_data, fused_input, fused_ops, - out); - }; + const FusedGraphRunner run_fused = [this]( + const Tensor& input_data, const Tensor& filter_data, + const Tensor& bias_data, const std::vector& fused_ops, + Tensor* out) { + std::vector fused_input = {bias_data}; + if (std::find(fused_ops.begin(), fused_ops.end(), "Add") != + fused_ops.end()) { + fused_input.push_back(input_data); + } + RunMklFusedConv2DOp(input_data, filter_data, fused_input, fused_ops, out); + }; CommonTestUtilities::VerifyFusedTensorsClose( depth, image_width, image_height, image_batch_count, filter_size, @@ -623,86 +622,86 @@ class MklFusedMatMulOpTest : public OpsTestBase { void VerifyFusedMatMul(const int kBatch, const int kInputChannel, const int kOutputChannel, const std::vector& fused_ops) { - const FusedGraphRunner run_default = - [this](const Tensor& input, const Tensor& weight, const Tensor& bias, - const std::vector& fused_ops, Tensor* output) { - auto root = tensorflow::Scope::NewRootScope(); - auto input_op = - ops::Const(root.WithOpName("input"), Input::Initializer(input)); - Output next_op = ops::MatMul(root.WithOpName("matmul"), input_op, - ops::Const(root.WithOpName("weight"), - Input::Initializer(weight))); + const FusedGraphRunner run_default = [this]( + 
const Tensor& input, const Tensor& weight, const Tensor& bias, + const std::vector& fused_ops, Tensor* output) { + auto root = tensorflow::Scope::NewRootScope(); + auto input_op = + ops::Const(root.WithOpName("input"), Input::Initializer(input)); + Output next_op = ops::MatMul( + root.WithOpName("matmul"), input_op, + ops::Const(root.WithOpName("weight"), Input::Initializer(weight))); - string last_op = ""; - if (std::find(fused_ops.begin(), fused_ops.end(), "BiasAdd") != - fused_ops.end()) { - last_op = "with_bias"; - next_op = ops::BiasAdd( - root.WithOpName(last_op), next_op, - ops::Const(root.WithOpName("bias"), Input::Initializer(bias))); - } + string last_op = ""; + if (std::find(fused_ops.begin(), fused_ops.end(), "BiasAdd") != + fused_ops.end()) { + last_op = "with_bias"; + next_op = ops::BiasAdd( + root.WithOpName(last_op), next_op, + ops::Const(root.WithOpName("bias"), Input::Initializer(bias))); + } - if (std::find(fused_ops.begin(), fused_ops.end(), "Relu") != - fused_ops.end()) { - last_op = "with_relu"; - next_op = ops::Relu(root.WithOpName(last_op), next_op); - } + if (std::find(fused_ops.begin(), fused_ops.end(), "Relu") != + fused_ops.end()) { + last_op = "with_relu"; + next_op = ops::Relu(root.WithOpName(last_op), next_op); + } - if (std::find(fused_ops.begin(), fused_ops.end(), "Relu6") != - fused_ops.end()) { - last_op = "with_relu6"; - next_op = ops::Relu6(root.WithOpName(last_op), next_op); - } + if (std::find(fused_ops.begin(), fused_ops.end(), "Relu6") != + fused_ops.end()) { + last_op = "with_relu6"; + next_op = ops::Relu6(root.WithOpName(last_op), next_op); + } - if (std::find(fused_ops.begin(), fused_ops.end(), "Elu") != - fused_ops.end()) { - last_op = "with_elu"; - next_op = ops::Elu(root.WithOpName(last_op), next_op); - } + if (std::find(fused_ops.begin(), fused_ops.end(), "Elu") != + fused_ops.end()) { + last_op = "with_elu"; + next_op = ops::Elu(root.WithOpName(last_op), next_op); + } - CommonTestUtilities::RunAndFetch(root, last_op, output); - }; + CommonTestUtilities::RunAndFetch(root, last_op, output); + }; - const FusedGraphRunner run_fused = - [this](const Tensor& input, const Tensor& weight, const Tensor& bias, - const std::vector& fused_ops, Tensor* output) { - DataType dtype = DataTypeToEnum::v(); - const int num_args = 1; + const FusedGraphRunner run_fused = [this]( + const Tensor& input, const Tensor& weight, const Tensor& bias, + const std::vector& fused_ops, Tensor* output) { + DataType dtype = DataTypeToEnum::v(); + const int num_args = 1; - TF_EXPECT_OK(NodeDefBuilder("MklFusedMatMul", "_MklFusedMatMul") - .Input(FakeInput(dtype)) - .Input(FakeInput(dtype)) - .Input(FakeInput(num_args, dtype)) - .Input(FakeInput(DT_UINT8)) - .Input(FakeInput(DT_UINT8)) - .Input(FakeInput(num_args, DT_UINT8)) - .Attr("T", dtype) - .Attr("transpose_a", false) - .Attr("transpose_b", false) - .Attr("num_args", num_args) - .Attr("fused_ops", fused_ops) - .Attr("epsilon", 0.0001) - .Attr("_kernel", "MklLayoutDependentOp") - .Finalize(node_def())); + TF_EXPECT_OK(NodeDefBuilder("MklFusedMatMul", "_MklFusedMatMul") + .Input(FakeInput(dtype)) + .Input(FakeInput(dtype)) + .Input(FakeInput(num_args, dtype)) + .Input(FakeInput(DT_UINT8)) + .Input(FakeInput(DT_UINT8)) + .Input(FakeInput(num_args, DT_UINT8)) + .Attr("T", dtype) + .Attr("transpose_a", false) + .Attr("transpose_b", false) + .Attr("num_args", num_args) + .Attr("fused_ops", fused_ops) + .Attr("epsilon", 0.0001) + .Attr("_kernel", "MklLayoutDependentOp") + .Finalize(node_def())); - TF_EXPECT_OK(InitOp()); + 
TF_EXPECT_OK(InitOp()); - AddInputFromArray(input.shape(), input.flat()); - AddInputFromArray(weight.shape(), weight.flat()); - AddInputFromArray(bias.shape(), bias.flat()); - // Add MKL meta input for input, filter and bias. - AddInputFromArray(dummy_shape, dummy_tensor); - AddInputFromArray(dummy_shape, dummy_tensor); - AddInputFromArray(dummy_shape, dummy_tensor); + AddInputFromArray(input.shape(), input.flat()); + AddInputFromArray(weight.shape(), weight.flat()); + AddInputFromArray(bias.shape(), bias.flat()); + // Add MKL meta input for input, filter and bias. + AddInputFromArray(dummy_shape, dummy_tensor); + AddInputFromArray(dummy_shape, dummy_tensor); + AddInputFromArray(dummy_shape, dummy_tensor); - TF_ASSERT_OK(RunOpKernel()); + TF_ASSERT_OK(RunOpKernel()); - const Tensor& output_tensor = *GetOutput(0); - const Tensor& output_meta_tensor = *GetOutput(1); - CommonTestUtilities test_util; - test_util.PerformConversion(dtype, output_tensor, output_meta_tensor, - output); - }; + const Tensor& output_tensor = *GetOutput(0); + const Tensor& output_meta_tensor = *GetOutput(1); + CommonTestUtilities test_util; + test_util.PerformConversion(dtype, output_tensor, output_meta_tensor, + output); + }; CommonTestUtilities::VerifyFusedMatrixClose(kInputChannel, kBatch, kOutputChannel, fused_ops, @@ -757,6 +756,84 @@ using MklFusedMatMulDataTypes = ::testing::Types; INSTANTIATE_TYPED_TEST_CASE_P(Test, MklFusedMatMulOpTest, MklFusedMatMulDataTypes); +// Test the performance of MklFusedMatMul weight cache. +// For the first time B matrix will be reordered and cached which will be +// used for subsequent runs +class MklFusedMatMulCacheTest : public OpsTestBase {}; + +TEST_F(MklFusedMatMulCacheTest, WeightCached) { + const int num_args = 1; + const std::vector& fused_ops = {"BiasAdd"}; + + TF_ASSERT_OK(NodeDefBuilder("MklFusedMatMul", "_MklFusedMatMul") + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(num_args, DT_FLOAT)) + .Input(FakeInput(DT_UINT8)) + .Input(FakeInput(DT_UINT8)) + .Input(FakeInput(num_args, DT_UINT8)) + .Attr("T", DT_FLOAT) + .Attr("transpose_a", false) + .Attr("transpose_b", false) + .Attr("num_args", num_args) + .Attr("fused_ops", fused_ops) + .Attr("epsilon", 0.0001) + .Attr("_kernel", "MklLayoutDependentOp") + .Finalize(node_def())); + + TF_EXPECT_OK(InitOp()); + // The tensor shape of (1,3) is selected to allow the mkldnn expected + // weight format to be made as OI rather than IO for BS > 1 + // A matrix is: + // | 1 | 2 | 3 | + AddInputFromArray(TensorShape({1, 3}), {1, 2, 3}); + // B matrix is: + // | 7 | 8 | 9 | 10 | + // | 11 | 12 | 13 | 14 | + // | 15 | 16 | 17 | 18 | + AddInputFromArray(TensorShape({3, 4}), + {7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}); + // Bias vector. + AddInputFromArray(TensorShape({4}), {1, 2, 3, 4}); + // Add MKL meta input for input, filter and bias. 
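+  // (Arithmetic check of the expected result: [1 2 3] x B = {74, 80, 86, 92},
+  // and adding the bias {1, 2, 3, 4} gives the {75, 82, 89, 96} verified below.)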
+ AddInputFromArray(dummy_shape, dummy_tensor); + AddInputFromArray(dummy_shape, dummy_tensor); + AddInputFromArray(dummy_shape, dummy_tensor); + + int64 start_time = Env::Default()->NowMicros(); + TF_ASSERT_OK(RunOpKernel()); + int64 end_time = Env::Default()->NowMicros(); + int64 total_duration_unopt = end_time - start_time; + + // Final result after Bias addition: + // | 75 | 82 | 89 | 96 | + Tensor expected(DT_FLOAT, TensorShape({1, 4})); + test::FillValues(&expected, {75, 82, 89, 96}); + + const Tensor& output = *GetOutput(0); + const Tensor& mkl_shape_tensor = *GetOutput(1); + CommonTestUtilities test_util; + test_util.ConvertAndCompare(DT_FLOAT, output, mkl_shape_tensor, expected); + + // Test for the second time to use the cached weight + start_time = Env::Default()->NowMicros(); + TF_ASSERT_OK(RunOpKernel()); + end_time = Env::Default()->NowMicros(); + int64 total_duration_opt = end_time - start_time; + LOG(INFO) << " Time taken by first call : " << total_duration_unopt + << ", Time taken after Caching : " << total_duration_opt; + + // Cached call should be at least 20% faster. + EXPECT_LT(total_duration_opt, total_duration_unopt * 0.8); + + // Compare the result with expected result + CommonTestUtilities test_util_new; + const Tensor& output_new = *GetOutput(0); + const Tensor& mkl_shape_tensor_new = *GetOutput(1); + test_util_new.ConvertAndCompare(DT_FLOAT, output_new, mkl_shape_tensor_new, + expected); +} + class BiasCacheTest : public OpsTestBase { public: template @@ -906,19 +983,18 @@ class MklPadWithFusedConv2DOpTest : public OpsTestBase { int image_width = kImageWidth, int image_height = kImageHeight, int image_batch_count = kImageBatchCount) { - const BiasAddGraphRunner run_default = [this](const Tensor& input_data, - const Tensor& filter_data, - const Tensor& bias_data, - Tensor* out) { + const BiasAddGraphRunner run_default = [this]( + const Tensor& input_data, const Tensor& filter_data, + const Tensor& bias_data, Tensor* out) { RunMklPadWithFusedConv2DAndBias(input_data, filter_data, bias_data, out); }; - const BiasAddGraphRunner run_fused = - [this](const Tensor& input_data, const Tensor& filter_data, - const Tensor& bias_data, Tensor* out) { - RunMklFusedConv2DWithPadOp(input_data, filter_data, {bias_data}, - {"BiasAdd"}, out); - }; + const BiasAddGraphRunner run_fused = [this]( + const Tensor& input_data, const Tensor& filter_data, + const Tensor& bias_data, Tensor* out) { + RunMklFusedConv2DWithPadOp(input_data, filter_data, {bias_data}, + {"BiasAdd"}, out); + }; CommonTestUtilities::VerifyBiasAddTensorsClose( depth, image_width, image_height, image_batch_count, filter_size, @@ -931,19 +1007,19 @@ class MklPadWithFusedConv2DOpTest : public OpsTestBase { int filter_size, int filter_count, int depth = kDepth, int image_width = kImageWidth, int image_height = kImageHeight, int image_batch_count = kImageBatchCount) { - const BiasAddGraphRunner run_default = - [this](const Tensor& input_data, const Tensor& filter_data, - const Tensor& bias_data, Tensor* out) { - RunMklPadWithFusedConv2DAndBiasRelu(input_data, filter_data, - bias_data, out); - }; + const BiasAddGraphRunner run_default = [this]( + const Tensor& input_data, const Tensor& filter_data, + const Tensor& bias_data, Tensor* out) { + RunMklPadWithFusedConv2DAndBiasRelu(input_data, filter_data, bias_data, + out); + }; - const BiasAddGraphRunner run_fused = - [this](const Tensor& input_data, const Tensor& filter_data, - const Tensor& bias_data, Tensor* out) { - RunMklFusedConv2DWithPadOp(input_data, filter_data, 
{bias_data}, - {"BiasAdd", "Relu"}, out); - }; + const BiasAddGraphRunner run_fused = [this]( + const Tensor& input_data, const Tensor& filter_data, + const Tensor& bias_data, Tensor* out) { + RunMklFusedConv2DWithPadOp(input_data, filter_data, {bias_data}, + {"BiasAdd", "Relu"}, out); + }; CommonTestUtilities::VerifyBiasAddTensorsClose( depth, image_width, image_height, image_batch_count, filter_size, diff --git a/tensorflow/core/kernels/mkl_matmul_op_fused.cc b/tensorflow/core/kernels/mkl_matmul_op_fused.cc index 02495f672d2..5cfde35ee0a 100644 --- a/tensorflow/core/kernels/mkl_matmul_op_fused.cc +++ b/tensorflow/core/kernels/mkl_matmul_op_fused.cc @@ -28,13 +28,15 @@ namespace tensorflow { // Fuse Operation template -class MklFusedMatMulOp : public MklDnnMatMulOpBase { +class MklFusedMatMulOp : public MklDnnMatMulOpBase { public: explicit MklFusedMatMulOp(OpKernelConstruction* ctx) - : MklDnnMatMulOpBase(ctx) { + : MklDnnMatMulOpBase(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("fused_ops", &fused_ops_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_a", &transpose_a_)); OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_b", &transpose_b_)); + OP_REQUIRES_OK(ctx, + ctx->GetAttr("is_filter_const", &(this->is_weight_const_))); OP_REQUIRES(ctx, fused_ops_.size() <= 2, errors::InvalidArgument( @@ -58,13 +60,13 @@ class MklFusedMatMulOp : public MklDnnMatMulOpBase { MklDnnShape weight_mkl_shape; GetMklShape(ctx, this->kInputIndexSrc, &src_mkl_shape); GetMklShape(ctx, this->kInputIndexWeight, &weight_mkl_shape); + OP_REQUIRES(ctx, !weight_mkl_shape.IsMklTensor(), + errors::InvalidArgument("Weight should not be in MKL Layout")); // Get shapes of input tensors auto src_tf_shape = src_mkl_shape.IsMklTensor() ? src_mkl_shape.GetTfShape() : src_tensor.shape(); - auto weight_tf_shape = weight_mkl_shape.IsMklTensor() - ? weight_mkl_shape.GetTfShape() - : weight_tensor.shape(); + auto weight_tf_shape = weight_tensor.shape(); // Check the constraint of input matrix and bias OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(src_tf_shape), @@ -84,11 +86,10 @@ class MklFusedMatMulOp : public MklDnnMatMulOpBase { const int k = src_tf_shape.dim_size(dim_pair[0]); const int channel = weight_tf_shape.dim_size(1 - dim_pair[1]); - OP_REQUIRES( - ctx, k == weight_tf_shape.dim_size(dim_pair[1]), - errors::InvalidArgument( - "Matrix size-incompatible: In[0]: ", src_tf_shape.DebugString(), - ", In[1]: ", weight_tf_shape.DebugString())); + OP_REQUIRES(ctx, k == weight_tf_shape.dim_size(dim_pair[1]), + errors::InvalidArgument("Matrix size-incompatible: In[0]: ", + src_tf_shape.DebugString(), ", In[1]: ", + weight_tf_shape.DebugString())); OP_REQUIRES(ctx, bias_tensor.shape().dim_size(0) == channel, errors::InvalidArgument( "Must provide as many biases as the channel size: ", @@ -106,8 +107,12 @@ class MklFusedMatMulOp : public MklDnnMatMulOpBase { memory::format weight_format = transpose_b_ ? memory::format::oi : memory::format::io; - MklDnnMatMulFwdParams matmul_params(src_dims, weight_dims, bias_dims, - dst_dims, weight_format); + // Set weight format for primitive: + // 1. const, let MKL-DNN determine format because it will be cached; + // 2. var, keep the original format to avoid reordering. + MklDnnMatMulFwdParams matmul_params( + src_dims, weight_dims, bias_dims, dst_dims, + (this->is_weight_const_) ? memory::format::any : weight_format); // Extend the basic parameters for data types and fusions. 
ExtendMklDnnMatMulFwdParams(ctx, matmul_params); @@ -119,7 +124,7 @@ class MklFusedMatMulOp : public MklDnnMatMulOpBase { std::shared_ptr matmul_pd = matmul_prim->GetPrimitiveDesc(); - if (src_mkl_shape.IsMklTensor() && weight_mkl_shape.IsMklTensor()) { + if (src_mkl_shape.IsMklTensor()) { this->AllocateOutputTensor(ctx, *matmul_pd, dst_dims, memory::format::nc, &dst_tensor); } else { @@ -142,7 +147,7 @@ class MklFusedMatMulOp : public MklDnnMatMulOpBase { T* bias_data = const_cast(bias_tensor.flat().data()); T* dst_data = const_cast(dst_tensor->flat().data()); - // Any input is MKL format, reorder it if necessary. + // Reorder input if necessary. MklDnnData src_mkl(&(this->cpu_engine_)); MklDnnData weight_mkl(&(this->cpu_engine_)); @@ -156,10 +161,28 @@ class MklFusedMatMulOp : public MklDnnMatMulOpBase { } } - if (weight_mkl_shape.IsMklTensor()) { - memory::desc input_md = weight_mkl_shape.GetMklLayout(); + // Get cached data when weight is const. + memory::format expected_format = matmul_prim->GetweightMemoryFormat(); + DCHECK(expected_format != weight_format && this->is_weight_const_); + if (this->is_weight_const_) { + T* cached_weight_data = nullptr; + if (this->IsWeightCacheEmpty(ctx)) { + auto weight_md = + memory::desc(weight_dims, MklDnnType(), weight_format); + this->CacheWeight(ctx, matmul_pd, cached_weight_data, weight_tensor, + weight_mkl, weight_md); + } + cached_weight_data = this->GetCachedWeight(ctx, expected_format); + + // Cache weight may fail when it gets different format in different + // iteration. Fallback to reoder if it happens. + // TODO: Fix this slow path. + if (cached_weight_data != nullptr) { + weight_data = cached_weight_data; + } else { + memory::desc input_md = + memory::desc(weight_dims, MklDnnType(), weight_format); - if (input_md.data.format != weight_format) { weight_mkl.SetUsrMem(input_md, weight_data); weight_mkl.CheckReorderToOpMem( matmul_pd.get()->weights_primitive_desc()); @@ -170,9 +193,9 @@ class MklFusedMatMulOp : public MklDnnMatMulOpBase { matmul_prim->Execute(src_data, weight_data, bias_data, dst_data); } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + ", message: " + + string(e.message) + ", in file " + string(__FILE__) + + ":" + std::to_string(__LINE__); OP_REQUIRES_OK( ctx, errors::Aborted("Operation received an exception:", error_msg)); } diff --git a/tensorflow/core/kernels/mkl_matmul_ops_common.h b/tensorflow/core/kernels/mkl_matmul_ops_common.h index f7666d59883..f80579b8bef 100644 --- a/tensorflow/core/kernels/mkl_matmul_ops_common.h +++ b/tensorflow/core/kernels/mkl_matmul_ops_common.h @@ -343,7 +343,7 @@ class MklDnnMatMulFwdPrimitiveFactory : public MklPrimitiveFactory { } }; -template +template class MklDnnMatMulOpBase : public OpKernel { public: explicit MklDnnMatMulOpBase(OpKernelConstruction* context) @@ -374,9 +374,90 @@ class MklDnnMatMulOpBase : public OpKernel { output_tf_shape, output_mkl_shape); } + // LOCKS_EXCLUDED annotation ensures that the lock (mu_) cannot + // be acquired before entering the function, since it is acquired + // inside the function. + inline bool IsWeightCacheEmpty(OpKernelContext* context) LOCKS_EXCLUDED(mu_) { + tf_shared_lock lock(mu_); + return (weight_oi.NumElements() == 0); + } + + // Cache the converted weight in a persistent tensor. 
+ // Only one thread can execute this method at any given time. + void CacheWeight( + OpKernelContext* context, + const std::shared_ptr& + matmul_fwd_pd, + Tweight* weight_data, const Tensor& weight_tensor, + MklDnnData& weight, const memory::desc& weight_md) + LOCKS_EXCLUDED(mu_) { + mutex_lock lock(mu_); + const Tensor& weight_t = *weight_oi.AccessTensor(context); + + // if the weights are already cahced, there's nothing to do + if (weight_t.NumElements() > 0) { + return; + } + + // reorder and cache the weight + weight.SetUsrMem(weight_md, &weight_tensor); + weight.CheckReorderToOpMem(matmul_fwd_pd.get()->weights_primitive_desc()); + weight_data = static_cast(weight.GetOpMem().get_data_handle()); + + Tensor* weight_tensor_ptr = nullptr; + + TensorShape weight_tf_shape; + weight_tf_shape.AddDim( + (matmul_fwd_pd.get()->weights_primitive_desc().get_size() / + sizeof(Tweight))); + + OP_REQUIRES_OK(context, context->allocate_persistent( + DataTypeToEnum::value, weight_tf_shape, + &weight_oi, &weight_tensor_ptr)); + + void* weight_oi_t_data = weight.GetTensorBuffer(weight_tensor_ptr); + size_t weight_size = weight.GetOpMem().get_primitive_desc().get_size(); + memcpy(weight_oi_t_data, weight_data, weight_size); + + // cache the memory descriptor + Tensor* weight_md_tensor_ptr = nullptr; + TensorShape weight_mkl_format; + weight_mkl_format.AddDim(1); + + OP_REQUIRES_OK(context, context->allocate_persistent( + DT_INT32, weight_mkl_format, &weight_oi_md, + &weight_md_tensor_ptr)); + weight_md_tensor_ptr->scalar()() = + matmul_fwd_pd.get()->weights_primitive_desc().desc().data.format; + } + + Tweight* GetCachedWeight(OpKernelContext* context, + const memory::format& weight_mf) + LOCKS_EXCLUDED(mu_) { + tf_shared_lock lock(mu_); + const Tensor& weight_t = *weight_oi.AccessTensor(context); + const Tensor& weight_md_t = *weight_oi_md.AccessTensor(context); + + // Check if the memory descriptor of the cached weight is same as + // weight_mf. 
if so use the cached memory, else return NULL + if (weight_md_t.scalar().size() && + weight_md_t.scalar()() == weight_mf) { + return static_cast( + const_cast(weight_t.flat().data())); + } + return nullptr; + } + engine cpu_engine_ = engine(engine::cpu, 0); protected: + // Tensor to save reordered weight + mutex mu_; + PersistentTensor weight_oi GUARDED_BY(mu_); + PersistentTensor weight_oi_md GUARDED_BY(mu_); + + bool is_weight_const_; + const int kInputIndexSrc = 0; const int kInputIndexWeight = 1; const int kInputIndexBias = 2; diff --git a/tensorflow/core/kernels/mkl_qmatmul_op.cc b/tensorflow/core/kernels/mkl_qmatmul_op.cc index f9f199547ed..12ea643b607 100644 --- a/tensorflow/core/kernels/mkl_qmatmul_op.cc +++ b/tensorflow/core/kernels/mkl_qmatmul_op.cc @@ -109,7 +109,7 @@ namespace tensorflow { template -class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase { +class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase { public: virtual ~MklDnnQuantizedMatMulOp() { if (this->input_bias_ != nullptr) { @@ -134,7 +134,7 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase { } explicit MklDnnQuantizedMatMulOp(OpKernelConstruction* context) - : MklDnnMatMulOpBase(context) { + : MklDnnMatMulOpBase(context) { string mode_string; OP_REQUIRES_OK(context, context->GetAttr("input_quant_mode", &mode_string)); if (mode_string == "MIN_FIRST") { @@ -146,10 +146,10 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase { "Quantization mode must be either MIN_FIRST or SCALED, but received ", mode_string)); } - is_weight_const_ = false; + this->is_weight_const_ = false; if (context->HasAttr("is_weight_const")) { - OP_REQUIRES_OK(context, - context->GetAttr("is_weight_const", &is_weight_const_)); + OP_REQUIRES_OK(context, context->GetAttr("is_weight_const", + &(this->is_weight_const_))); } } @@ -258,15 +258,15 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase { // TF default format is IO. So in that case convert weight from IO // to OI for the first iteration and cache it to reuse in the // subsequent iterations, if the weight is constant. - if (is_weight_const_) { + if (this->is_weight_const_) { // Check if the weight is already cached or not - if (IsWeightCacheEmpty(context)) { + if (this->IsWeightCacheEmpty(context)) { // Cache weight if it is not cached. - CacheWeight(context, matmul_fwd_pd, weight_data, weight_tensor, - weight, weight_md); + this->CacheWeight(context, matmul_fwd_pd, weight_data, + weight_tensor, weight, weight_md); } - weight_data = - GetCachedWeight(context, matmul_fwd->GetweightMemoryFormat()); + weight_data = this->GetCachedWeight( + context, matmul_fwd->GetweightMemoryFormat()); is_weight_cached = (weight_data != nullptr); } @@ -461,87 +461,8 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase { // Buffer to save the compensated bias float* comp_bias_ = nullptr; - // Tensor to save reordered weight - mutex mu_; - PersistentTensor weight_oi GUARDED_BY(mu_); - PersistentTensor weight_oi_md GUARDED_BY(mu_); int mode_; - bool is_weight_const_; - // LOCKS_EXCLUDED annotation ensures that the lock (mu_) cannot - // be acquired before entering the function, since it is acquired - // inside the function. - inline bool IsWeightCacheEmpty(OpKernelContext* context) LOCKS_EXCLUDED(mu_) { - tf_shared_lock lock(mu_); - return (weight_oi.NumElements() == 0); - } - - // Cache the converted weight in a persistent tensor. - // Only one thread can execute this method at any given time. 
- void CacheWeight( - OpKernelContext* context, - const std::shared_ptr& - matmul_fwd_pd, - Tweight* weight_data, const Tensor& weight_tensor, - MklDnnData& weight, const memory::desc& weight_md) - LOCKS_EXCLUDED(mu_) { - mutex_lock lock(mu_); - const Tensor& weight_t = *weight_oi.AccessTensor(context); - - // If the weights are already cahced, there's nothing to do - if (weight_t.NumElements() > 0) { - return; - } - - // Reorder and cache the weight - weight.SetUsrMem(weight_md, &weight_tensor); - weight.CheckReorderToOpMem(matmul_fwd_pd.get()->weights_primitive_desc()); - weight_data = static_cast(weight.GetOpMem().get_data_handle()); - - Tensor* weight_tensor_ptr = nullptr; - - TensorShape weight_tf_shape; - weight_tf_shape.AddDim( - (matmul_fwd_pd.get()->weights_primitive_desc().get_size() / - sizeof(Tweight))); - - OP_REQUIRES_OK(context, context->allocate_persistent( - DataTypeToEnum::value, weight_tf_shape, - &weight_oi, &weight_tensor_ptr)); - - void* weight_oi_t_data = weight.GetTensorBuffer(weight_tensor_ptr); - size_t weight_size = weight.GetOpMem().get_primitive_desc().get_size(); - memcpy(weight_oi_t_data, weight_data, weight_size); - - // Cache the memory descriptor - Tensor* weight_md_tensor_ptr = nullptr; - TensorShape weight_mkl_format; - - weight_mkl_format.AddDim(1); - - OP_REQUIRES_OK(context, context->allocate_persistent( - DT_INT32, weight_mkl_format, &weight_oi_md, - &weight_md_tensor_ptr)); - weight_md_tensor_ptr->scalar()() = - matmul_fwd_pd.get()->weights_primitive_desc().desc().data.format; - } - - Tweight* GetCachedWeight(OpKernelContext* context, - const memory::format& weight_mf) - LOCKS_EXCLUDED(mu_) { - tf_shared_lock lock(mu_); - const Tensor& weight_t = *weight_oi.AccessTensor(context); - const Tensor& weight_md_t = *weight_oi_md.AccessTensor(context); - - // Check if the memory descriptor of the cached weight is same as - // weight_mf. If so use the cached memory, else return NULL - if ((weight_md_t.scalar().size() > 0) && - weight_md_t.scalar()() == weight_mf) { - return static_cast( - const_cast(weight_t.flat().data())); - } - return nullptr; - } }; template Date: Wed, 22 Jan 2020 10:32:04 +0100 Subject: [PATCH 022/442] TFLu: Update stm32f4 target Add stm32f4 specific renode script files, instead of reusing the bluepill files. --- .../lite/micro/testing/Dockerfile.bluepill | 2 +- .../lite/micro/testing/Dockerfile.stm32f4 | 21 +++++++ tensorflow/lite/micro/testing/bluepill.resc | 2 +- tensorflow/lite/micro/testing/stm32f4.resc | 33 ++++++++++ tensorflow/lite/micro/testing/stm32f4.robot | 23 +++++++ .../lite/micro/testing/test_stm32f4_binary.sh | 60 +++++++++++++++++++ .../tools/make/targets/stm32f4_makefile.inc | 3 +- 7 files changed, 140 insertions(+), 4 deletions(-) create mode 100644 tensorflow/lite/micro/testing/Dockerfile.stm32f4 create mode 100644 tensorflow/lite/micro/testing/stm32f4.resc create mode 100644 tensorflow/lite/micro/testing/stm32f4.robot create mode 100755 tensorflow/lite/micro/testing/test_stm32f4_binary.sh diff --git a/tensorflow/lite/micro/testing/Dockerfile.bluepill b/tensorflow/lite/micro/testing/Dockerfile.bluepill index 7d6d81af0f4..330d8457b3e 100644 --- a/tensorflow/lite/micro/testing/Dockerfile.bluepill +++ b/tensorflow/lite/micro/testing/Dockerfile.bluepill @@ -1,4 +1,4 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tensorflow/lite/micro/testing/Dockerfile.stm32f4 b/tensorflow/lite/micro/testing/Dockerfile.stm32f4 new file mode 100644 index 00000000000..75e6118c5ef --- /dev/null +++ b/tensorflow/lite/micro/testing/Dockerfile.stm32f4 @@ -0,0 +1,21 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# This docker configuration file lets you emulate a stm32f4 board +# on an x86 desktop or laptop, which can be useful for debugging and +# automated testing. +FROM antmicro/renode:latest + +LABEL maintainer="Pete Warden " \ No newline at end of file diff --git a/tensorflow/lite/micro/testing/bluepill.resc b/tensorflow/lite/micro/testing/bluepill.resc index c46b33e3fb0..9cc9dcd9f79 100644 --- a/tensorflow/lite/micro/testing/bluepill.resc +++ b/tensorflow/lite/micro/testing/bluepill.resc @@ -1,4 +1,4 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tensorflow/lite/micro/testing/stm32f4.resc b/tensorflow/lite/micro/testing/stm32f4.resc new file mode 100644 index 00000000000..45f213c22b1 --- /dev/null +++ b/tensorflow/lite/micro/testing/stm32f4.resc @@ -0,0 +1,33 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +using sysbus + +mach create +machine LoadPlatformDescription @platforms/cpus/stm32f4.repl + +# These lines are needed to show the results of DebugLog calls in the output. 
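+# (DebugLog on this target writes via ARM semihosting - a SYS_WRITE0 request
+# issued with "bkpt 0xAB" in tensorflow/lite/micro/stm32f4/debug_log.cc - so
+# the semihosting UART registered below is what carries the output that the
+# robot test waits for.)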
+machine LoadPlatformDescriptionFromString "uartSemihosting: UART.SemihostingUart @ cpu" +showAnalyzer cpu.uartSemihosting Antmicro.Renode.Analyzers.LoggingUartAnalyzer + +logFile @/tmp/renode_stm32f4_log.txt + +macro reset +""" + sysbus LoadELF $bin +""" + +runMacro $reset + diff --git a/tensorflow/lite/micro/testing/stm32f4.robot b/tensorflow/lite/micro/testing/stm32f4.robot new file mode 100644 index 00000000000..d1d204f51e9 --- /dev/null +++ b/tensorflow/lite/micro/testing/stm32f4.robot @@ -0,0 +1,23 @@ +*** Settings *** +Suite Setup Setup +Suite Teardown Teardown +Test Setup Reset Emulation +Resource /opt/renode/tests/renode-keywords.robot + +*** Variables *** +${UART} sysbus.cpu.uartSemihosting + +*** Test Cases *** +Should Run Stm32f4 Test + [Documentation] Runs a Stm32f4 test and waits for a specific string on the semihosting UART + [Tags] stm32f4 uart tensorflow arm + ${BIN} = Get Environment Variable BIN + ${SCRIPT} = Get Environment Variable SCRIPT + ${EXPECTED} = Get Environment Variable EXPECTED + Execute Command $bin = @${BIN} + Execute Script ${SCRIPT} + + Create Terminal Tester ${UART} timeout=30 + Start Emulation + + Wait For Line On Uart ${EXPECTED} diff --git a/tensorflow/lite/micro/testing/test_stm32f4_binary.sh b/tensorflow/lite/micro/testing/test_stm32f4_binary.sh new file mode 100755 index 00000000000..de7d7492260 --- /dev/null +++ b/tensorflow/lite/micro/testing/test_stm32f4_binary.sh @@ -0,0 +1,60 @@ +#!/bin/bash -e +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Tests a 'stm32f4' STM32F4 ELF by parsing the log output of Renode emulation. +# +# First argument is the ELF location. +# Second argument is a regular expression that's required to be in the output logs +# for the test to pass. +# +# This script must be run from the top-level folder of the tensorflow github +# repository as it mounts `pwd` to the renode docker image (via docker run -v) +# and paths in the docker run command assume the entire tensorflow repo is mounted. + +declare -r ROOT_DIR=`pwd` +declare -r TEST_TMPDIR=/tmp/test_stm32f4_binary/ +declare -r MICRO_LOG_PATH=${TEST_TMPDIR} +declare -r MICRO_LOG_FILENAME=${MICRO_LOG_PATH}/logs.txt +mkdir -p ${MICRO_LOG_PATH} + +docker build -t renode_stm32f4 \ + -f ${ROOT_DIR}/tensorflow/lite/micro/testing/Dockerfile.stm32f4 \ + ${ROOT_DIR}/tensorflow/lite/micro/testing/ + +exit_code=0 +# running in `if` to avoid setting +e +if ! 
docker run \ + --log-driver=none -a stdout -a stderr \ + -v ${ROOT_DIR}:/workspace \ + -v /tmp:/tmp \ + -e BIN=/workspace/$1 \ + -e SCRIPT=/workspace/tensorflow/lite/micro/testing/stm32f4.resc \ + -e EXPECTED="$2" \ + -it renode_stm32f4 \ + /bin/bash -c "/opt/renode/tests/test.sh /workspace/tensorflow/lite/micro/testing/stm32f4.robot 2>&1 >${MICRO_LOG_FILENAME}" +then + exit_code=1 +fi + +echo "LOGS:" +cat ${MICRO_LOG_FILENAME} +if [ $exit_code -eq 0 ] +then + echo "$1: PASS" +else + echo "$1: FAIL - '$2' not found in logs." +fi +exit $exit_code diff --git a/tensorflow/lite/micro/tools/make/targets/stm32f4_makefile.inc b/tensorflow/lite/micro/tools/make/targets/stm32f4_makefile.inc index b99e11e0328..4df3e755934 100644 --- a/tensorflow/lite/micro/tools/make/targets/stm32f4_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/stm32f4_makefile.inc @@ -55,8 +55,7 @@ ifeq ($(TARGET), stm32f4) EXCLUDED_SRCS := \ $(MAKEFILE_DIR)/downloads/stm32_bare_lib/source/debug_log.c MICROLITE_CC_SRCS := $(filter-out $(EXCLUDED_SRCS), $(MICROLITE_CC_SRCS)) - # Stm32f4 is reusing the bluepill renode scripts for now - TEST_SCRIPT := tensorflow/lite/micro/testing/test_bluepill_binary.sh + TEST_SCRIPT := tensorflow/lite/micro/testing/test_stm32f4_binary.sh # TODO, non working tests.. the micro_speech example and conv_test.cc/depthwise_conv_test.cc partly works EXCLUDED_TESTS := \ tensorflow/lite/micro/micro_interpreter_test.cc \ From 56d0e95efef32c6851b3ed2510542c224857ec0e Mon Sep 17 00:00:00 2001 From: nikochiko Date: Thu, 23 Jan 2020 18:39:49 +0530 Subject: [PATCH 023/442] Use LazyLoader to import network in save.py --- tensorflow/python/keras/saving/save.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/saving/save.py b/tensorflow/python/keras/saving/save.py index 8ab516bc8a2..35e72c97956 100644 --- a/tensorflow/python/keras/saving/save.py +++ b/tensorflow/python/keras/saving/save.py @@ -22,13 +22,17 @@ import sys import six -from tensorflow.python.keras.engine import network from tensorflow.python.keras.saving import hdf5_format from tensorflow.python.keras.saving.saved_model import load as saved_model_load from tensorflow.python.keras.saving.saved_model import save as saved_model_save from tensorflow.python.saved_model import loader_impl +from tensroflow.python.util.lazy_loader import LazyLoader from tensorflow.python.util.tf_export import keras_export +network = LazyLoader( + 'network', globals(), + 'tensroflow.python.keras.engine.network') + # pylint: disable=g-import-not-at-top if sys.version_info >= (3, 4): import pathlib From 8fbd517aa3c23b98300bd1970af627c00e4c02b6 Mon Sep 17 00:00:00 2001 From: nikochiko Date: Fri, 24 Jan 2020 11:10:33 +0530 Subject: [PATCH 024/442] Fix typo --- tensorflow/python/keras/saving/save.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/keras/saving/save.py b/tensorflow/python/keras/saving/save.py index 35e72c97956..2f8613c2c60 100644 --- a/tensorflow/python/keras/saving/save.py +++ b/tensorflow/python/keras/saving/save.py @@ -26,7 +26,7 @@ from tensorflow.python.keras.saving import hdf5_format from tensorflow.python.keras.saving.saved_model import load as saved_model_load from tensorflow.python.keras.saving.saved_model import save as saved_model_save from tensorflow.python.saved_model import loader_impl -from tensroflow.python.util.lazy_loader import LazyLoader +from tensorflow.python.util.lazy_loader import LazyLoader from tensorflow.python.util.tf_export import keras_export 
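# (`network` is bound through LazyLoader rather than a normal import, so the
# tensorflow.python.keras.engine.network module is only imported when the
# `network` symbol is first accessed.)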
network = LazyLoader( From 3b8e7c05a08277d5fd534c1e535321bfa0817e9d Mon Sep 17 00:00:00 2001 From: Puneeth K Date: Fri, 24 Jan 2020 23:02:03 +0530 Subject: [PATCH 025/442] Update the documentation --- tensorflow/python/util/nest.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py index 6187e325001..847a7687c61 100644 --- a/tensorflow/python/util/nest.py +++ b/tensorflow/python/util/nest.py @@ -260,8 +260,9 @@ def flatten(structure, expand_composites=False): running. Args: - structure: an arbitrarily nested structure or a scalar object. Note, numpy - arrays are considered scalars. + structure: an arbitrarily nested structure which can be a scalar, or + tuple or dict or list of constructed scalars and/or other tuples/lists, or + a scalar object. Note, numpy arrays are considered scalars. expand_composites: If true, then composite tensors such as tf.SparseTensor and tf.RaggedTensor are expanded into their component tensors. @@ -306,8 +307,14 @@ def assert_same_structure(nest1, nest2, check_types=True, ``` Args: - nest1: an arbitrarily nested structure. - nest2: an arbitrarily nested structure. + nest1: an arbitrarily nested structure which can be a scalar, or + tuple or dict or list of constructed scalars and/or other + tuples/lists, or a scalar object. Note, numpy arrays are considered + scalars. + nest2: an arbitrarily nested structure which can be a scalar, or + tuple or dict or list of constructed scalars and/or other + tuples/lists, or a scalar object. Note, numpy arrays are considered + scalars. check_types: if `True` (default) types of sequences are checked as well, including the keys of dictionaries. If set to `False`, for example a list and a tuple of objects will look the same if they have the same @@ -514,7 +521,7 @@ def map_structure(func, *structure, **kwargs): Args: func: A callable that accepts as many arguments as there are structures. - *structure: scalar, or tuple or list of constructed scalars and/or other + *structure: scalar, or tuple or dict or list of constructed scalars and/or other tuples/lists, or scalars. Note: numpy arrays are considered as scalars. **kwargs: Valid keyword args are: From 7cff0c65361d19007491aee487665197d908da09 Mon Sep 17 00:00:00 2001 From: Puneeth K Date: Sat, 25 Jan 2020 10:39:21 +0530 Subject: [PATCH 026/442] Update Line 246 --- tensorflow/python/util/nest.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py index 847a7687c61..01b4ab5876e 100644 --- a/tensorflow/python/util/nest.py +++ b/tensorflow/python/util/nest.py @@ -243,8 +243,11 @@ def is_nested(seq): def flatten(structure, expand_composites=False): """Returns a flat list from a given nested structure. - If nest is not a sequence, tuple (or a namedtuple), dict, or an attrs class, - then returns a single-element list: + If nest is not a structure (which can be a scalar, or + tuple or dict or list of constructed scalars and/or other tuples/lists, + or a scalar object. Note, numpy arrays are considered scalars.), tuple + (or a namedtuple), dict, or an attrs class, then returns a single-element + list: [nest]. 
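For a quick feel of the flattening behavior described here, a small sketch using the public `tf.nest` wrappers around these utilities (assuming a TF 2.x install with numpy available):

```python
import numpy as np
import tensorflow as tf

# An atom (non-nested value) comes back as a single-element list.
print(tf.nest.flatten(42))                     # [42]

# Nested lists/tuples are flattened depth-first, left to right.
print(tf.nest.flatten([[1, 2], (3, [4])]))     # [1, 2, 3, 4]

# numpy arrays are treated as atoms and are not flattened element-wise.
print(len(tf.nest.flatten(np.array([1, 2]))))  # 1
```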
In the case of dict instances, the sequence consists of the values, sorted by From d8b5dab4c648d5a0f66c325b42c34745cc631a2a Mon Sep 17 00:00:00 2001 From: nikochiko Date: Sat, 25 Jan 2020 21:16:36 +0530 Subject: [PATCH 027/442] Fix typo --- tensorflow/python/keras/saving/save.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/keras/saving/save.py b/tensorflow/python/keras/saving/save.py index 2f8613c2c60..cb94f336408 100644 --- a/tensorflow/python/keras/saving/save.py +++ b/tensorflow/python/keras/saving/save.py @@ -31,7 +31,7 @@ from tensorflow.python.util.tf_export import keras_export network = LazyLoader( 'network', globals(), - 'tensroflow.python.keras.engine.network') + 'tensorflow.python.keras.engine.network') # pylint: disable=g-import-not-at-top if sys.version_info >= (3, 4): From f6ece2169e725f2ec74231cf03b2fa3ee17376fe Mon Sep 17 00:00:00 2001 From: Rasul Karimov Date: Sun, 26 Jan 2020 01:53:45 +0300 Subject: [PATCH 028/442] add converter for SparseSoftmaxCrossEntropyWithLogits --- .../ops/parallel_for/control_flow_ops_test.py | 14 ++++++++++++++ tensorflow/python/ops/parallel_for/pfor.py | 1 + 2 files changed, 15 insertions(+) diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py index 9bc859fb032..929908b96ce 100644 --- a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py +++ b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py @@ -532,6 +532,20 @@ class NNTest(PForTestCase): self._test_loop_fn(loop_fn, 3) + def test_sparse_softmax_cross_entropy_with_logits(self): + logits = random_ops.random_uniform([3, 2, 4]) + labels = random_ops.random_uniform(shape=[3, 2], maxval=4, dtype=dtypes.int32) + + def loop_fn(i): + logits_i = array_ops.gather(logits, i) + labels_i = array_ops.gather(labels, i) + loss = nn.softmax_cross_entropy_with_logits( + labels=labels_i, logits=logits_i) + total_loss = math_ops.reduce_sum(loss) + return loss + + self._test_loop_fn(loop_fn, 3) + class RandomTest(PForTestCase): diff --git a/tensorflow/python/ops/parallel_for/pfor.py b/tensorflow/python/ops/parallel_for/pfor.py index c6caf2b7f17..c7c9e6db95b 100644 --- a/tensorflow/python/ops/parallel_for/pfor.py +++ b/tensorflow/python/ops/parallel_for/pfor.py @@ -1587,6 +1587,7 @@ def _inputs_with_flattening(pfor_input, input_indices): @RegisterPForWithArgs("MaxPool3DGradGrad", dims=[0, 1, 2]) @RegisterPForWithArgs("MaxPoolGradGrad", dims=[0, 1, 2]) @RegisterPForWithArgs("SoftmaxCrossEntropyWithLogits", dims=[0, 1]) +@RegisterPForWithArgs("SparseSoftmaxCrossEntropyWithLogits", dims=[0, 1]) def _convert_flatten_batch(pfor_input, op_type, dims): del op_type inputs = _inputs_with_flattening(pfor_input, dims) From 55ac90809cc04603fc4f7a66cfe9cc746fd6fcc7 Mon Sep 17 00:00:00 2001 From: Puneeth K Date: Sun, 26 Jan 2020 11:55:29 +0530 Subject: [PATCH 029/442] Defined structure in module overview --- tensorflow/python/util/nest.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py index 01b4ab5876e..3c88f52d095 100644 --- a/tensorflow/python/util/nest.py +++ b/tensorflow/python/util/nest.py @@ -19,6 +19,10 @@ This module can perform operations on nested structures. A nested structure is a Python sequence, tuple (including `namedtuple`), or dict that can contain further sequences, tuples, and dicts. 
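As a rough illustration of such a nested structure in practice (again via the public `tf.nest` names, which wrap this module), dicts, lists and tuples can be mixed freely and `map_structure` keeps the nesting intact:

```python
import tensorflow as tf

structure = {"a": (1, 2), "b": [3, {"c": 4}]}

# map_structure applies the function to every atom and preserves the nesting.
doubled = tf.nest.map_structure(lambda x: x * 2, structure)
assert doubled == {"a": (2, 4), "b": [6, {"c": 8}]}

# Both structures are considered structurally identical.
tf.nest.assert_same_structure(structure, doubled)
```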
+Structures are scalar, or tuple or dict or list of constructed scalars and/or +other tuples/lists, or a scalar object. Note, numpy arrays are considered +scalars. + attr.s decorated classes (http://www.attrs.org) are also supported, in the same way as `namedtuple`. @@ -243,11 +247,8 @@ def is_nested(seq): def flatten(structure, expand_composites=False): """Returns a flat list from a given nested structure. - If nest is not a structure (which can be a scalar, or - tuple or dict or list of constructed scalars and/or other tuples/lists, - or a scalar object. Note, numpy arrays are considered scalars.), tuple - (or a namedtuple), dict, or an attrs class, then returns a single-element - list: + If nest is not a structure , tuple (or a namedtuple), dict, or an attrs class, + then returns a single-element list: [nest]. In the case of dict instances, the sequence consists of the values, sorted by @@ -263,9 +264,7 @@ def flatten(structure, expand_composites=False): running. Args: - structure: an arbitrarily nested structure which can be a scalar, or - tuple or dict or list of constructed scalars and/or other tuples/lists, or - a scalar object. Note, numpy arrays are considered scalars. + structure: an arbitrarily nested structure. expand_composites: If true, then composite tensors such as tf.SparseTensor and tf.RaggedTensor are expanded into their component tensors. @@ -310,14 +309,8 @@ def assert_same_structure(nest1, nest2, check_types=True, ``` Args: - nest1: an arbitrarily nested structure which can be a scalar, or - tuple or dict or list of constructed scalars and/or other - tuples/lists, or a scalar object. Note, numpy arrays are considered - scalars. - nest2: an arbitrarily nested structure which can be a scalar, or - tuple or dict or list of constructed scalars and/or other - tuples/lists, or a scalar object. Note, numpy arrays are considered - scalars. + nest1: an arbitrarily nested structure. + nest2: an arbitrarily nested structure. check_types: if `True` (default) types of sequences are checked as well, including the keys of dictionaries. If set to `False`, for example a list and a tuple of objects will look the same if they have the same From e585cc8b696733b1a8467e6d99e36a25c926d3aa Mon Sep 17 00:00:00 2001 From: Puneeth K Date: Sun, 26 Jan 2020 12:00:03 +0530 Subject: [PATCH 030/442] Changed scalar to atom --- tensorflow/python/util/nest.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py index 3c88f52d095..4848a2336ae 100644 --- a/tensorflow/python/util/nest.py +++ b/tensorflow/python/util/nest.py @@ -19,9 +19,9 @@ This module can perform operations on nested structures. A nested structure is a Python sequence, tuple (including `namedtuple`), or dict that can contain further sequences, tuples, and dicts. -Structures are scalar, or tuple or dict or list of constructed scalars and/or -other tuples/lists, or a scalar object. Note, numpy arrays are considered -scalars. +Structures are atom, or tuple or dict or list of constructed atoms and/or +other tuples/lists, or an atom object. Note, numpy arrays are considered +atoms. attr.s decorated classes (http://www.attrs.org) are also supported, in the same way as `namedtuple`. @@ -264,7 +264,8 @@ def flatten(structure, expand_composites=False): running. Args: - structure: an arbitrarily nested structure. + structure: an arbitrarily nested structure. Note, numpy arrays are considered + atoms and are not flattened. 
expand_composites: If true, then composite tensors such as tf.SparseTensor and tf.RaggedTensor are expanded into their component tensors. From acf7733f2c4fbaf7773ec50ecdb68a2030d5baf8 Mon Sep 17 00:00:00 2001 From: Puneeth K Date: Sun, 26 Jan 2020 12:27:08 +0530 Subject: [PATCH 031/442] Change IsMappingHelper to IsNestCompatibleMappingHelper --- tensorflow/python/util/util.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc index d1e43c92164..6daa378a9f7 100644 --- a/tensorflow/python/util/util.cc +++ b/tensorflow/python/util/util.cc @@ -213,9 +213,9 @@ int IsInstanceOfRegisteredType(PyObject* obj, const char* type_name) { // Returns 1 if `o` is considered a mapping for the purposes of Flatten(). // Returns 0 otherwise. // Returns -1 if an error occurred. -int IsMappingHelper(PyObject* o) { +int IsNestCompatibleMappingHelper(PyObject* o) { static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) { - return IsInstanceOfRegisteredType(to_check, "Mapping"); + return IsInstanceOfRegisteredType(to_check, "MutableMapping"); }); if (PyDict_Check(o)) return true; return check_cache->CachedLookup(o); From 0d36503c13732c37163a5349107f06fb43f3ccf0 Mon Sep 17 00:00:00 2001 From: Puneeth K Date: Mon, 27 Jan 2020 20:19:37 +0530 Subject: [PATCH 032/442] Updated module overview --- tensorflow/python/util/nest.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py index 4848a2336ae..a39d6190e2b 100644 --- a/tensorflow/python/util/nest.py +++ b/tensorflow/python/util/nest.py @@ -16,12 +16,16 @@ """## Functions for working with arbitrarily nested sequences of elements. This module can perform operations on nested structures. A nested structure is a -Python sequence, tuple (including `namedtuple`), or dict that can contain -further sequences, tuples, and dicts. +Python collection that can contain further collections as well as other objects +called atoms. Note that numpy arrays are considered atoms. -Structures are atom, or tuple or dict or list of constructed atoms and/or -other tuples/lists, or an atom object. Note, numpy arrays are considered -atoms. +nest recognizes the following types of collections: + 1.tuple + 2.namedtuple + 3.dict + 4.orderedDict + 5.MutableMapping + 6.attr.s attr.s decorated classes (http://www.attrs.org) are also supported, in the same way as `namedtuple`. From fe9a5451fd56b5e80bb489167745af7325ede138 Mon Sep 17 00:00:00 2001 From: Puneeth K Date: Mon, 27 Jan 2020 20:32:52 +0530 Subject: [PATCH 033/442] Updated util.cc --- tensorflow/python/util/util.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc index 6daa378a9f7..aa02b33e4c8 100644 --- a/tensorflow/python/util/util.cc +++ b/tensorflow/python/util/util.cc @@ -213,6 +213,14 @@ int IsInstanceOfRegisteredType(PyObject* obj, const char* type_name) { // Returns 1 if `o` is considered a mapping for the purposes of Flatten(). // Returns 0 otherwise. // Returns -1 if an error occurred. 
+int IsMappingHelper(PyObject* o) { + static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) { + return IsInstanceOfRegisteredType(to_check, "Mapping"); + }); + if (PyDict_Check(o)) return true; + return check_cache->CachedLookup(o); +} + int IsNestCompatibleMappingHelper(PyObject* o) { static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) { return IsInstanceOfRegisteredType(to_check, "MutableMapping"); From dd5e3ca703ef9c38ec73bccda1936bea9569fc75 Mon Sep 17 00:00:00 2001 From: Rasul Karimov Date: Mon, 27 Jan 2020 20:19:54 +0300 Subject: [PATCH 034/442] fix test --- tensorflow/python/ops/parallel_for/control_flow_ops_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py index 929908b96ce..840b9724a62 100644 --- a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py +++ b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py @@ -539,7 +539,7 @@ class NNTest(PForTestCase): def loop_fn(i): logits_i = array_ops.gather(logits, i) labels_i = array_ops.gather(labels, i) - loss = nn.softmax_cross_entropy_with_logits( + loss = nn.sparse_softmax_cross_entropy_with_logits( labels=labels_i, logits=logits_i) total_loss = math_ops.reduce_sum(loss) return loss From d11e6417c69c0943b3476b36a0ab67ab9e1ac58b Mon Sep 17 00:00:00 2001 From: Rasul Karimov Date: Wed, 29 Jan 2020 04:17:40 +0300 Subject: [PATCH 035/442] fix pylint --- tensorflow/python/ops/parallel_for/control_flow_ops_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py index 840b9724a62..388f9639597 100644 --- a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py +++ b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py @@ -541,7 +541,6 @@ class NNTest(PForTestCase): labels_i = array_ops.gather(labels, i) loss = nn.sparse_softmax_cross_entropy_with_logits( labels=labels_i, logits=logits_i) - total_loss = math_ops.reduce_sum(loss) return loss self._test_loop_fn(loop_fn, 3) From 414d61699b5a8bcae21d87946647bfa0dc427ce6 Mon Sep 17 00:00:00 2001 From: Rasul Karimov Date: Wed, 29 Jan 2020 04:26:18 +0300 Subject: [PATCH 036/442] fix pylint (2) --- tensorflow/python/ops/parallel_for/control_flow_ops_test.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py index 388f9639597..862aeff860a 100644 --- a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py +++ b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py @@ -534,13 +534,14 @@ class NNTest(PForTestCase): def test_sparse_softmax_cross_entropy_with_logits(self): logits = random_ops.random_uniform([3, 2, 4]) - labels = random_ops.random_uniform(shape=[3, 2], maxval=4, dtype=dtypes.int32) + labels = random_ops.random_uniform(shape=[3, 2], maxval=4, + dtype=dtypes.int32) def loop_fn(i): logits_i = array_ops.gather(logits, i) labels_i = array_ops.gather(labels, i) loss = nn.sparse_softmax_cross_entropy_with_logits( - labels=labels_i, logits=logits_i) + labels=labels_i, logits=logits_i) return loss self._test_loop_fn(loop_fn, 3) From 72008da70827f5076beed839dbb8099fb9f3a474 Mon Sep 17 00:00:00 2001 From: Alex Hoffman Date: Thu, 30 Jan 2020 13:20:57 +0100 Subject: [PATCH 037/442] Fixed inconsistencies between int and 
int32_t type uses --- tensorflow/lite/c/c_api.h | 2 +- tensorflow/lite/c/c_api_experimental.cc | 2 +- tensorflow/lite/c/c_api_experimental.h | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/c/c_api.h b/tensorflow/lite/c/c_api.h index 6c46e92bc53..754fc3b8bbd 100644 --- a/tensorflow/lite/c/c_api.h +++ b/tensorflow/lite/c/c_api.h @@ -164,7 +164,7 @@ TFL_CAPI_EXPORT extern void TfLiteInterpreterDelete( TfLiteInterpreter* interpreter); // Returns the number of input tensors associated with the model. -TFL_CAPI_EXPORT extern int TfLiteInterpreterGetInputTensorCount( +TFL_CAPI_EXPORT extern int32_t TfLiteInterpreterGetInputTensorCount( const TfLiteInterpreter* interpreter); // Returns the tensor associated with the input index. diff --git a/tensorflow/lite/c/c_api_experimental.cc b/tensorflow/lite/c/c_api_experimental.cc index dbf4cd7a175..71637ebe137 100644 --- a/tensorflow/lite/c/c_api_experimental.cc +++ b/tensorflow/lite/c/c_api_experimental.cc @@ -45,7 +45,7 @@ void TfLiteInterpreterOptionsAddBuiltinOp( void TfLiteInterpreterOptionsAddCustomOp(TfLiteInterpreterOptions* options, const char* name, const TfLiteRegistration* registration, - int min_version, int max_version) { + int32_t min_version, int32_t max_version) { options->op_resolver.AddCustom(name, registration, min_version, max_version); } diff --git a/tensorflow/lite/c/c_api_experimental.h b/tensorflow/lite/c/c_api_experimental.h index bf21e2ee4b5..4a956a103e5 100644 --- a/tensorflow/lite/c/c_api_experimental.h +++ b/tensorflow/lite/c/c_api_experimental.h @@ -35,7 +35,7 @@ TFL_CAPI_EXPORT extern TfLiteStatus TfLiteInterpreterResetVariableTensors( // making the provided TfLiteRegistration instance static. TFL_CAPI_EXPORT void TfLiteInterpreterOptionsAddBuiltinOp( TfLiteInterpreterOptions* options, TfLiteBuiltinOperator op, - const TfLiteRegistration* registration, int min_version, int max_version); + const TfLiteRegistration* registration, int32_t min_version, int32_t max_version); // Adds an op registration for a custom operator. // @@ -45,7 +45,7 @@ TFL_CAPI_EXPORT void TfLiteInterpreterOptionsAddBuiltinOp( // practice is making the provided TfLiteRegistration instance static. 
TFL_CAPI_EXPORT void TfLiteInterpreterOptionsAddCustomOp( TfLiteInterpreterOptions* options, const char* name, - const TfLiteRegistration* registration, int min_version, int max_version); + const TfLiteRegistration* registration, int32_t min_version, int32_t max_version); #ifdef __cplusplus } // extern "C" From 3eb344aad3dc631bbc967ae408fa5a1a17dabfd8 Mon Sep 17 00:00:00 2001 From: Alex Hoffman Date: Fri, 31 Jan 2020 10:59:44 +0100 Subject: [PATCH 038/442] Static casts now fit return type --- tensorflow/lite/c/c_api.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/c/c_api.cc b/tensorflow/lite/c/c_api.cc index 7ceddab4ecf..8fd2ec0d51a 100644 --- a/tensorflow/lite/c/c_api.cc +++ b/tensorflow/lite/c/c_api.cc @@ -145,7 +145,7 @@ void TfLiteInterpreterDelete(TfLiteInterpreter* interpreter) { int32_t TfLiteInterpreterGetInputTensorCount( const TfLiteInterpreter* interpreter) { - return static_cast(interpreter->impl->inputs().size()); + return static_cast(interpreter->impl->inputs().size()); } TfLiteTensor* TfLiteInterpreterGetInputTensor( @@ -172,7 +172,7 @@ TfLiteStatus TfLiteInterpreterInvoke(TfLiteInterpreter* interpreter) { int32_t TfLiteInterpreterGetOutputTensorCount( const TfLiteInterpreter* interpreter) { - return static_cast(interpreter->impl->outputs().size()); + return static_cast(interpreter->impl->outputs().size()); } const TfLiteTensor* TfLiteInterpreterGetOutputTensor( From 6759808168d6b98576acfc86a97124a3a418965f Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Fri, 31 Jan 2020 20:28:52 +0000 Subject: [PATCH 039/442] Upgrade aws cpp sdk --- tensorflow/core/platform/s3/aws_logging.cc | 2 + tensorflow/core/platform/s3/aws_logging.h | 3 + third_party/aws/BUILD.bazel | 25 +++++++- third_party/aws/aws-c-common.bazel | 67 ++++++++++++++++++++++ third_party/aws/aws-c-event-stream.bazel | 35 +++++++++++ third_party/aws/aws-checksums.bazel | 35 +++++++++++ third_party/aws/workspace.bzl | 42 ++++++++++++-- 7 files changed, 203 insertions(+), 6 deletions(-) create mode 100644 third_party/aws/aws-c-common.bazel create mode 100644 third_party/aws/aws-c-event-stream.bazel create mode 100644 third_party/aws/aws-checksums.bazel diff --git a/tensorflow/core/platform/s3/aws_logging.cc b/tensorflow/core/platform/s3/aws_logging.cc index 1d549a2a61e..e0ec94a269f 100644 --- a/tensorflow/core/platform/s3/aws_logging.cc +++ b/tensorflow/core/platform/s3/aws_logging.cc @@ -69,6 +69,8 @@ void AWSLogSystem::LogMessage(Aws::Utils::Logging::LogLevel log_level, } } +void AWSLogSystem::Flush() { return; } + namespace { // Taken from tensorflow/core/platform/default/logging.cc diff --git a/tensorflow/core/platform/s3/aws_logging.h b/tensorflow/core/platform/s3/aws_logging.h index b0da8f3c835..95abf8799de 100644 --- a/tensorflow/core/platform/s3/aws_logging.h +++ b/tensorflow/core/platform/s3/aws_logging.h @@ -55,6 +55,9 @@ class AWSLogSystem : public Aws::Utils::Logging::LogSystemInterface { const char* tag, const Aws::OStringStream& messageStream) override; + // Flushes the buffered messages if the logger supports buffering + virtual void Flush() override; + private: void LogMessage(Aws::Utils::Logging::LogLevel log_level, const string& message); diff --git a/third_party/aws/BUILD.bazel b/third_party/aws/BUILD.bazel index 2f093f72a43..a7a114a1714 100644 --- a/third_party/aws/BUILD.bazel +++ b/third_party/aws/BUILD.bazel @@ -44,7 +44,9 @@ cc_library( "aws-cpp-sdk-core/source/http/standard/**/*.cpp", "aws-cpp-sdk-core/source/utils/*.cpp", 
"aws-cpp-sdk-core/source/utils/base64/**/*.cpp", + "aws-cpp-sdk-core/source/utils/event/*.cpp", "aws-cpp-sdk-core/source/utils/json/**/*.cpp", + "aws-cpp-sdk-core/source/utils/logging/*.cpp", "aws-cpp-sdk-core/source/utils/logging/**/*.cpp", "aws-cpp-sdk-core/source/utils/memory/**/*.cpp", "aws-cpp-sdk-core/source/utils/stream/**/*.cpp", @@ -54,35 +56,43 @@ cc_library( "aws-cpp-sdk-core/source/utils/crypto/factory/**/*.cpp", "aws-cpp-sdk-s3/include/**/*.h", "aws-cpp-sdk-s3/source/**/*.cpp", + "aws-cpp-sdk-core/source/monitoring/*.cpp", + "aws-cpp-sdk-core/source/net/linux-shared/*.cpp", + "aws-cpp-sdk-core/source/utils/memory/*.cpp", + "aws-cpp-sdk-core/source/utils/crypto/openssl/*.cpp", ]), hdrs = [ "aws-cpp-sdk-core/include/aws/core/SDKConfig.h", ], copts = [ "-DAWS_SDK_VERSION_MAJOR=1", - "-DAWS_SDK_VERSION_MINOR=5", - "-DAWS_SDK_VERSION_PATCH=8", + "-DAWS_SDK_VERSION_MINOR=7", + "-DAWS_SDK_VERSION_PATCH=226", ], defines = select({ "@org_tensorflow//tensorflow:linux_aarch64": [ "PLATFORM_LINUX", "ENABLE_CURL_CLIENT", "ENABLE_NO_ENCRYPTION", + "OPENSSL_IS_BORINGSSL", ], "@org_tensorflow//tensorflow:linux_x86_64": [ "PLATFORM_LINUX", "ENABLE_CURL_CLIENT", "ENABLE_NO_ENCRYPTION", + "OPENSSL_IS_BORINGSSL", ], "@org_tensorflow//tensorflow:macos": [ "PLATFORM_APPLE", "ENABLE_CURL_CLIENT", "ENABLE_NO_ENCRYPTION", + "OPENSSL_IS_BORINGSSL", ], "@org_tensorflow//tensorflow:linux_ppc64le": [ "PLATFORM_LINUX", "ENABLE_CURL_CLIENT", "ENABLE_NO_ENCRYPTION", + "OPENSSL_IS_BORINGSSL", ], "//conditions:default": [], }), @@ -92,7 +102,18 @@ cc_library( ], deps = [ "@curl", + "@boringssl//:crypto", + "@aws-c-common", + "@aws-c-event-stream", + "@aws-checksums", ], + copts = [ + "-DENABLE_OPENSSL_ENCRYPTION", + "-DAWS_SDK_VERSION_MAJOR=1", + "-DAWS_SDK_VERSION_MINOR=7", + "-DAWS_SDK_VERSION_PATCH=226", + "-DOPENSSL_IS_BORINGSSL", + ], ) template_rule( diff --git a/third_party/aws/aws-c-common.bazel b/third_party/aws/aws-c-common.bazel new file mode 100644 index 00000000000..97f258b8200 --- /dev/null +++ b/third_party/aws/aws-c-common.bazel @@ -0,0 +1,67 @@ +# Description: +# AWS C++ SDK + +package(default_visibility = ["//visibility:public"]) + +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE"]) + +load("@org_tensorflow//third_party:common.bzl", "template_rule") + +cc_library( + name = "aws-c-common", + srcs = select({ + "@org_tensorflow//tensorflow:linux_aarch64": glob([ + "source/posix/*.c", + ]), + "@org_tensorflow//tensorflow:linux_x86_64": glob([ + "source/posix/*.c", + ]), + "@org_tensorflow//tensorflow:macos": glob([ + "source/posix/*.c", + ]), + "@org_tensorflow//tensorflow:linux_ppc64le": glob([ + "source/posix/*.c", + ]), + "@org_tensorflow//tensorflow:raspberry_pi_armeabi": glob([ + "source/posix/*.c", + ]), + "//conditions:default": [], + }) + glob([ + "source/*.c", + "include/aws/common/*.h", + "include/**/*.h", + ]), + hdrs = [ + "include/aws/common/config.h", + ], + includes = [ + "include/", + ], + deps = [ + + ], + copts = [ + "-std=c99", + "-D_POSIX_C_SOURCE=199309L", + "-D_GNU_SOURCE", + ], + linkopts = [ + "-lrt", + "-pthread", + ], +) + +template_rule( + name = "config_h", + src = "include/aws/common/config.h.in", + out = "include/aws/common/config.h", + substitutions = { + "cmakedefine AWS_HAVE_GCC_OVERFLOW_MATH_EXTENSIONS": "undef AWS_HAVE_GCC_OVERFLOW_MATH_EXTENSIONS", + "cmakedefine AWS_HAVE_GCC_INLINE_ASM": "define AWS_HAVE_GCC_INLINE_ASM", + "cmakedefine AWS_HAVE_MSVC_MULX": "undef AWS_HAVE_MSVC_MULX", + "cmakedefine AWS_HAVE_EXECINFO": "define AWS_HAVE_EXECINFO", + 
}, +) + diff --git a/third_party/aws/aws-c-event-stream.bazel b/third_party/aws/aws-c-event-stream.bazel new file mode 100644 index 00000000000..898ab6f7bab --- /dev/null +++ b/third_party/aws/aws-c-event-stream.bazel @@ -0,0 +1,35 @@ +# Description: +# AWS C++ SDK + +package(default_visibility = ["//visibility:public"]) + +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE"]) + + +cc_library( + name = "aws-c-event-stream", + srcs = glob([ + "source/*.c", + "include/**/*.h", + ]), + hdrs = [ + ], + includes = [ + "include/", + ], + deps = [ + "@aws-c-common", + "@aws-checksums", + ], + copts = [ + "-std=c99", + "-D_POSIX_C_SOURCE=199309L", + "-D_GNU_SOURCE", + ], + linkopts = [ + "-lrt", + "-pthread", + ], +) diff --git a/third_party/aws/aws-checksums.bazel b/third_party/aws/aws-checksums.bazel new file mode 100644 index 00000000000..4cc42f32f74 --- /dev/null +++ b/third_party/aws/aws-checksums.bazel @@ -0,0 +1,35 @@ +# Description: +# AWS C++ SDK + +package(default_visibility = ["//visibility:public"]) + +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE"]) + + +cc_library( + name = "aws-checksums", + srcs = glob([ + "source/intel/*.c", + "source/*.c", + "include/**/*.h", + ]), + hdrs = [ + ], + includes = [ + "include/", + ], + deps = [ + "@aws-c-common", + ], + copts = [ + #"-std=c99", + #"-D_POSIX_C_SOURCE=199309L", + #"-D_GNU_SOURCE", + ], + linkopts = [ + #"-lrt", + #"-pthread", + ], +) diff --git a/third_party/aws/workspace.bzl b/third_party/aws/workspace.bzl index f37699e34c5..facf1e7758d 100644 --- a/third_party/aws/workspace.bzl +++ b/third_party/aws/workspace.bzl @@ -9,10 +9,44 @@ def repo(): third_party_http_archive( name = "aws", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/aws/aws-sdk-cpp/archive/1.5.8.tar.gz", - "https://github.com/aws/aws-sdk-cpp/archive/1.5.8.tar.gz", + "https://mirror.bazel.build/github.com/aws/aws-sdk-cpp/archive/1.7.226.tar.gz", + "https://github.com/aws/aws-sdk-cpp/archive/1.7.226.tar.gz", ], - sha256 = "89905075fe50aa13e0337ff905c2e8c1ce9caf77a3504484a7cda39179120ffc", - strip_prefix = "aws-sdk-cpp-1.5.8", + sha256 = "3a6eff15ee73a1a73c4c16ef2582eaef8647821750dab6d5cd0f137103b5c488", + strip_prefix = "aws-sdk-cpp-1.7.226", build_file = "//third_party/aws:BUILD.bazel", ) + + third_party_http_archive( + name = "aws-c-common", + urls = [ + "http://mirror.tensorflow.org/github.com/awslabs/aws-c-common/archive/v0.4.20.tar.gz", + "https://github.com/awslabs/aws-c-common/archive/v0.4.20.tar.gz" + ], + sha256 = "b0a86df4731fb5de00c5caaf95450ca26a1c0405919aee39927a9455bc5a6b05", + strip_prefix = "aws-c-common-0.4.20", + build_file = "//third_party/aws:aws-c-common.bazel", + ) + + third_party_http_archive( + name = "aws-c-event-stream", + urls = [ + "https://mirror.tensorflow.org/github.com/awslabs/aws-c-event-stream/archive/v0.1.4.tar.gz", + "https://github.com/awslabs/aws-c-event-stream/archive/v0.1.4.tar.gz", + ], + sha256 = "31d880d1c868d3f3df1e1f4b45e56ac73724a4dc3449d04d47fc0746f6f077b6", + strip_prefix = "aws-c-event-stream-0.1.4", + build_file = "//third_party/aws:aws-c-event-stream.bazel", + ) + + third_party_http_archive( + name = "aws-checksums", + urls = [ + "https://mirror.tensorflow.org/github.com/awslabs/aws-checksums/archive/v0.1.5.tar.gz", + "https://github.com/awslabs/aws-checksums/archive/v0.1.5.tar.gz", + ], + sha256 = "6e6bed6f75cf54006b6bafb01b3b96df19605572131a2260fddaf0e87949ced0", + strip_prefix = "aws-checksums-0.1.5", + build_file = "//third_party/aws:aws-checksums.bazel", + ) + From 
6ebeee20e7bcec7074765421493fa288c6984c7c Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Fri, 31 Jan 2020 21:14:31 +0000 Subject: [PATCH 040/442] Remove repeated section from bazel build file --- third_party/aws/BUILD.bazel | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/third_party/aws/BUILD.bazel b/third_party/aws/BUILD.bazel index a7a114a1714..eb3a555d390 100644 --- a/third_party/aws/BUILD.bazel +++ b/third_party/aws/BUILD.bazel @@ -1,5 +1,5 @@ # Description: -# AWS C++ SDK +# AWS C++ SDK package(default_visibility = ["//visibility:public"]) @@ -64,11 +64,6 @@ cc_library( hdrs = [ "aws-cpp-sdk-core/include/aws/core/SDKConfig.h", ], - copts = [ - "-DAWS_SDK_VERSION_MAJOR=1", - "-DAWS_SDK_VERSION_MINOR=7", - "-DAWS_SDK_VERSION_PATCH=226", - ], defines = select({ "@org_tensorflow//tensorflow:linux_aarch64": [ "PLATFORM_LINUX", From 902a7afe493265d76b2c8bd5e0ebfc267cb556a9 Mon Sep 17 00:00:00 2001 From: Gaurav Singh Date: Sat, 1 Feb 2020 06:57:38 -0500 Subject: [PATCH 041/442] [lite] check index channel before accessing center_frequencies_ --- tensorflow/lite/kernels/internal/mfcc_mel_filterbank.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/kernels/internal/mfcc_mel_filterbank.cc b/tensorflow/lite/kernels/internal/mfcc_mel_filterbank.cc index 4f22517866e..ac0df209750 100644 --- a/tensorflow/lite/kernels/internal/mfcc_mel_filterbank.cc +++ b/tensorflow/lite/kernels/internal/mfcc_mel_filterbank.cc @@ -99,8 +99,8 @@ bool MfccMelFilterbank::Initialize(int input_length, double input_sample_rate, if ((i < start_index_) || (i > end_index_)) { band_mapper_[i] = -2; // Indicate an unused Fourier coefficient. } else { - while ((center_frequencies_[channel] < melf) && - (channel < num_channels_)) { + while ((channel < num_channels_) && + (center_frequencies_[channel] < melf)) { ++channel; } band_mapper_[i] = channel - 1; // Can be == -1 From 7372667362c956709c8238cf20109d8d246120db Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Sat, 1 Feb 2020 01:42:38 +0000 Subject: [PATCH 042/442] Upgrade AWS SDK to 266, and upgrade its dependencies --- tensorflow/core/platform/s3/s3_file_system.cc | 3 +-- third_party/aws/BUILD.bazel | 2 +- third_party/aws/workspace.bzl | 16 ++++++++-------- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc index ba4528ad272..5253023fbb9 100644 --- a/tensorflow/core/platform/s3/s3_file_system.cc +++ b/tensorflow/core/platform/s3/s3_file_system.cc @@ -622,8 +622,7 @@ Status S3FileSystem::RenameFile(const string& src, const string& target) { Aws::String src_key = object.GetKey(); Aws::String target_key = src_key; target_key.replace(0, src_object.length(), target_object.c_str()); - Aws::String source = Aws::String(src_bucket.c_str()) + "/" + - Aws::Utils::StringUtils::URLEncode(src_key.c_str()); + Aws::String source = Aws::String(src_bucket.c_str()) + "/" + src_key.c_str(); copyObjectRequest.SetBucket(target_bucket.c_str()); copyObjectRequest.SetKey(target_key); diff --git a/third_party/aws/BUILD.bazel b/third_party/aws/BUILD.bazel index eb3a555d390..f9575453327 100644 --- a/third_party/aws/BUILD.bazel +++ b/third_party/aws/BUILD.bazel @@ -106,7 +106,7 @@ cc_library( "-DENABLE_OPENSSL_ENCRYPTION", "-DAWS_SDK_VERSION_MAJOR=1", "-DAWS_SDK_VERSION_MINOR=7", - "-DAWS_SDK_VERSION_PATCH=226", + "-DAWS_SDK_VERSION_PATCH=266", "-DOPENSSL_IS_BORINGSSL", ], ) diff --git a/third_party/aws/workspace.bzl 
b/third_party/aws/workspace.bzl index facf1e7758d..dae7a9c9264 100644 --- a/third_party/aws/workspace.bzl +++ b/third_party/aws/workspace.bzl @@ -9,22 +9,22 @@ def repo(): third_party_http_archive( name = "aws", urls = [ - "https://mirror.bazel.build/github.com/aws/aws-sdk-cpp/archive/1.7.226.tar.gz", - "https://github.com/aws/aws-sdk-cpp/archive/1.7.226.tar.gz", + "https://mirror.bazel.build/github.com/aws/aws-sdk-cpp/archive/1.7.266.tar.gz", + "https://github.com/aws/aws-sdk-cpp/archive/1.7.266.tar.gz", ], - sha256 = "3a6eff15ee73a1a73c4c16ef2582eaef8647821750dab6d5cd0f137103b5c488", - strip_prefix = "aws-sdk-cpp-1.7.226", + sha256 = "39fd8a2999260d2b8fcbc8187f1ed5299972c2b8bd14adb7850fd674fea67fb7", + strip_prefix = "aws-sdk-cpp-1.7.266", build_file = "//third_party/aws:BUILD.bazel", ) third_party_http_archive( name = "aws-c-common", urls = [ - "http://mirror.tensorflow.org/github.com/awslabs/aws-c-common/archive/v0.4.20.tar.gz", - "https://github.com/awslabs/aws-c-common/archive/v0.4.20.tar.gz" + "http://mirror.tensorflow.org/github.com/awslabs/aws-c-common/archive/v0.4.29.tar.gz", + "https://github.com/awslabs/aws-c-common/archive/v0.4.29.tar.gz" ], - sha256 = "b0a86df4731fb5de00c5caaf95450ca26a1c0405919aee39927a9455bc5a6b05", - strip_prefix = "aws-c-common-0.4.20", + sha256 = "01c2a58553a37b3aa5914d9e0bf7bf14507ff4937bc5872a678892ca20fcae1f", + strip_prefix = "aws-c-common-0.4.29", build_file = "//third_party/aws:aws-c-common.bazel", ) From 60481509a9324e38bc221f76adb9c067eb788ff3 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Wed, 5 Feb 2020 02:24:31 +0000 Subject: [PATCH 043/442] Enable encryption while building aws sdk --- third_party/aws/BUILD.bazel | 4 ---- 1 file changed, 4 deletions(-) diff --git a/third_party/aws/BUILD.bazel b/third_party/aws/BUILD.bazel index f9575453327..2e707fc4d0a 100644 --- a/third_party/aws/BUILD.bazel +++ b/third_party/aws/BUILD.bazel @@ -68,25 +68,21 @@ cc_library( "@org_tensorflow//tensorflow:linux_aarch64": [ "PLATFORM_LINUX", "ENABLE_CURL_CLIENT", - "ENABLE_NO_ENCRYPTION", "OPENSSL_IS_BORINGSSL", ], "@org_tensorflow//tensorflow:linux_x86_64": [ "PLATFORM_LINUX", "ENABLE_CURL_CLIENT", - "ENABLE_NO_ENCRYPTION", "OPENSSL_IS_BORINGSSL", ], "@org_tensorflow//tensorflow:macos": [ "PLATFORM_APPLE", "ENABLE_CURL_CLIENT", - "ENABLE_NO_ENCRYPTION", "OPENSSL_IS_BORINGSSL", ], "@org_tensorflow//tensorflow:linux_ppc64le": [ "PLATFORM_LINUX", "ENABLE_CURL_CLIENT", - "ENABLE_NO_ENCRYPTION", "OPENSSL_IS_BORINGSSL", ], "//conditions:default": [], From 9d94ec57e7f49cbe717c86c96485b561074a3aeb Mon Sep 17 00:00:00 2001 From: nikochiko Date: Wed, 5 Feb 2020 16:56:56 +0530 Subject: [PATCH 044/442] Fix linting --- tensorflow/python/keras/engine/network.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py index 0c1a44ac104..2c78e85b598 100644 --- a/tensorflow/python/keras/engine/network.py +++ b/tensorflow/python/keras/engine/network.py @@ -2042,8 +2042,8 @@ def validate_save_format(filepath, save_format): to Tensorflow SavedModel or HDF5. Output will default to 'tf' in TF2.X and 'h5' in TF1.X. - Defaults to 'h5' if `filepath` is a path to a hdf5 file (having suffix '.h5' or - '.hdf5' or '.keras') or is an h5py.File object. + Defaults to 'h5' if `filepath` is a path to a hdf5 file (having suffix '.h5' + or '.hdf5' or '.keras') or is an h5py.File object. Args: filepath: Value of the `filepath` argument passed to the method. 
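
The suffix rule described in the docstring above boils down to a simple check on the file name. A minimal sketch of that default (the standalone helper name `infer_save_format` is illustrative only, not the function being patched; the real code also treats `h5py.File` objects as HDF5):

```python
def infer_save_format(filepath, default='tf'):
  """Rough sketch of the suffix-based default described above."""
  hdf5_suffixes = ('.h5', '.hdf5', '.keras')
  if isinstance(filepath, str) and filepath.endswith(hdf5_suffixes):
    return 'h5'
  return default  # 'tf' in TF2.X and 'h5' in TF1.X, per the docstring
```
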
@@ -2086,14 +2086,13 @@ def validate_save_format(filepath, save_format): save_format = 'h5' else: raise ValueError( - 'Unknown format "%s". Was expecting one of {"tf", "h5"}.' % ( - save_format)) + 'Unknown format "%s". Was expecting one of {"tf", "h5"}.' + % (save_format)) if save_format == 'tf' and filepath_is_h5: raise ValueError( ('Got save_format="tf"/"tensorflow", but the filepath ("%s") looks ' - 'like an HDF5 file. Omit the ".h5"/".keras" when saving in ' - 'TensorFlow format.') - % filepath) + 'like an HDF5 file. Omit the ".h5"/".keras" when saving in ' + 'TensorFlow format.') % filepath) if save_format == 'tf' and filepath_is_h5py_file: raise ValueError( 'Got save_format="tf"/"tensorflow", but the given `filepath`' From cd63276e5a5bb1beb4d6592ae3726d7f8af8c098 Mon Sep 17 00:00:00 2001 From: 372046933 <372046933@users.noreply.github.com> Date: Thu, 6 Feb 2020 14:55:19 +0800 Subject: [PATCH 045/442] Update nn_impl.py fix package name --- tensorflow/python/ops/nn_impl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py index 2c00e051db2..99827a5cfb2 100644 --- a/tensorflow/python/ops/nn_impl.py +++ b/tensorflow/python/ops/nn_impl.py @@ -401,7 +401,7 @@ def compute_average_loss(per_example_loss, labels, predictions) # Compute loss that is scaled by sample_weight and by global batch size. - return tf.compute_average_loss( + return tf.nn.compute_average_loss( per_example_loss, sample_weight=sample_weight, global_batch_size=GLOBAL_BATCH_SIZE) @@ -452,7 +452,7 @@ def scale_regularization_loss(regularization_loss): labels, predictions) # Compute loss that is scaled by sample_weight and by global batch size. - loss = tf.compute_average_loss( + loss = tf.nn.compute_average_loss( per_example_loss, sample_weight=sample_weight, global_batch_size=GLOBAL_BATCH_SIZE) From 0d9f813bd5dc877bb901dffc3cca31ff84d5233a Mon Sep 17 00:00:00 2001 From: punndcoder28 Date: Thu, 6 Feb 2020 19:16:25 +0530 Subject: [PATCH 046/442] Changed util.cc --- tensorflow/python/util/nest.py | 4 ++++ tensorflow/python/util/util_wrapper.cc | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py index a39d6190e2b..008685b0d32 100644 --- a/tensorflow/python/util/nest.py +++ b/tensorflow/python/util/nest.py @@ -122,6 +122,7 @@ _is_mapping_view = _pywrap_utils.IsMappingView _is_attrs = _pywrap_utils.IsAttrs _is_composite_tensor = _pywrap_utils.IsCompositeTensor _is_type_spec = _pywrap_utils.IsTypeSpec +_is_mutable_mapping = _pywrap_utils.IsNestCompatibleMapping def _sequence_like(instance, args): @@ -168,6 +169,9 @@ def _sequence_like(instance, args): # Pack a CompositeTensor's components according to a TypeSpec. assert len(args) == 1 return instance._from_components(args[0]) # pylint: disable=protected-access + # elif _is_mutable_mapping(instance): + # new_mapping = instance_type(instance) + # new_mapping.update() elif isinstance(instance, _six.moves.range): return _sequence_like(list(instance), args) elif isinstance(instance, _wrapt.ObjectProxy): diff --git a/tensorflow/python/util/util_wrapper.cc b/tensorflow/python/util/util_wrapper.cc index 38915efcfee..c5085cd99ef 100644 --- a/tensorflow/python/util/util_wrapper.cc +++ b/tensorflow/python/util/util_wrapper.cc @@ -140,6 +140,24 @@ PYBIND11_MODULE(_pywrap_utils, m) { Returns: True if `instance` is a `collections.Mapping`. 
)pbdoc"); + m.def( + "IsNestCompatibleMapping", + [](const py::handle& o) { + bool result = tensorflow::swig::IsNestCompatibleMapping(o.ptr()); + if (PyErr_Occurred()) { + throw py::error_already_set(); + } + return result; + }, + R"pbdoc( + Returns True if `instance` is a `collections.MutableMapping`. + + Args: + instance: An instance of a Python object. + + Returns: + True if `instance` is a `collections.MutableMapping`. + )pbdoc"); m.def( "IsMappingView", [](const py::handle& o) { From 7b2f406c3598afc9409e0d51d869457d15493836 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 6 Feb 2020 22:45:00 +0000 Subject: [PATCH 047/442] Add includes --- third_party/aws/aws-c-common.bazel | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/third_party/aws/aws-c-common.bazel b/third_party/aws/aws-c-common.bazel index 97f258b8200..6c74a8a785a 100644 --- a/third_party/aws/aws-c-common.bazel +++ b/third_party/aws/aws-c-common.bazel @@ -29,12 +29,12 @@ cc_library( ]), "//conditions:default": [], }) + glob([ - "source/*.c", - "include/aws/common/*.h", + "source/**/*.c", "include/**/*.h", + "include/**/*.inl" ]), hdrs = [ - "include/aws/common/config.h", + "include/aws/common/config.h", ], includes = [ "include/", From 0cea45396476eb900090608af4eecf9e81dabaf3 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Thu, 6 Feb 2020 22:54:57 +0000 Subject: [PATCH 048/442] Run buildifier on third part aws bazel files --- third_party/aws/BUILD.bazel | 18 +++++++++--------- third_party/aws/aws-c-common.bazel | 14 ++++++-------- third_party/aws/aws-c-event-stream.bazel | 1 - third_party/aws/aws-checksums.bazel | 11 +++++------ third_party/aws/workspace.bzl | 15 +++++++-------- 5 files changed, 27 insertions(+), 32 deletions(-) diff --git a/third_party/aws/BUILD.bazel b/third_party/aws/BUILD.bazel index 2e707fc4d0a..81e896d7fc8 100644 --- a/third_party/aws/BUILD.bazel +++ b/third_party/aws/BUILD.bazel @@ -64,6 +64,13 @@ cc_library( hdrs = [ "aws-cpp-sdk-core/include/aws/core/SDKConfig.h", ], + copts = [ + "-DENABLE_OPENSSL_ENCRYPTION", + "-DAWS_SDK_VERSION_MAJOR=1", + "-DAWS_SDK_VERSION_MINOR=7", + "-DAWS_SDK_VERSION_PATCH=266", + "-DOPENSSL_IS_BORINGSSL", + ], defines = select({ "@org_tensorflow//tensorflow:linux_aarch64": [ "PLATFORM_LINUX", @@ -92,19 +99,12 @@ cc_library( "aws-cpp-sdk-s3/include/", ], deps = [ - "@curl", - "@boringssl//:crypto", "@aws-c-common", "@aws-c-event-stream", "@aws-checksums", + "@boringssl//:crypto", + "@curl", ], - copts = [ - "-DENABLE_OPENSSL_ENCRYPTION", - "-DAWS_SDK_VERSION_MAJOR=1", - "-DAWS_SDK_VERSION_MINOR=7", - "-DAWS_SDK_VERSION_PATCH=266", - "-DOPENSSL_IS_BORINGSSL", - ], ) template_rule( diff --git a/third_party/aws/aws-c-common.bazel b/third_party/aws/aws-c-common.bazel index 6c74a8a785a..edfcbd78394 100644 --- a/third_party/aws/aws-c-common.bazel +++ b/third_party/aws/aws-c-common.bazel @@ -31,7 +31,7 @@ cc_library( }) + glob([ "source/**/*.c", "include/**/*.h", - "include/**/*.inl" + "include/**/*.inl", ]), hdrs = [ "include/aws/common/config.h", @@ -40,16 +40,15 @@ cc_library( "include/", ], deps = [ - ], copts = [ - "-std=c99", - "-D_POSIX_C_SOURCE=199309L", - "-D_GNU_SOURCE", + "-std=c99", + "-D_POSIX_C_SOURCE=199309L", + "-D_GNU_SOURCE", ], linkopts = [ - "-lrt", - "-pthread", + "-lrt", + "-pthread", ], ) @@ -64,4 +63,3 @@ template_rule( "cmakedefine AWS_HAVE_EXECINFO": "define AWS_HAVE_EXECINFO", }, ) - diff --git a/third_party/aws/aws-c-event-stream.bazel b/third_party/aws/aws-c-event-stream.bazel index 898ab6f7bab..956670c8d28 100644 --- 
a/third_party/aws/aws-c-event-stream.bazel +++ b/third_party/aws/aws-c-event-stream.bazel @@ -7,7 +7,6 @@ licenses(["notice"]) # Apache 2.0 exports_files(["LICENSE"]) - cc_library( name = "aws-c-event-stream", srcs = glob([ diff --git a/third_party/aws/aws-checksums.bazel b/third_party/aws/aws-checksums.bazel index 4cc42f32f74..0af7c8cd4cf 100644 --- a/third_party/aws/aws-checksums.bazel +++ b/third_party/aws/aws-checksums.bazel @@ -7,7 +7,6 @@ licenses(["notice"]) # Apache 2.0 exports_files(["LICENSE"]) - cc_library( name = "aws-checksums", srcs = glob([ @@ -24,12 +23,12 @@ cc_library( "@aws-c-common", ], copts = [ - #"-std=c99", - #"-D_POSIX_C_SOURCE=199309L", - #"-D_GNU_SOURCE", + #"-std=c99", + #"-D_POSIX_C_SOURCE=199309L", + #"-D_GNU_SOURCE", ], linkopts = [ - #"-lrt", - #"-pthread", + #"-lrt", + #"-pthread", ], ) diff --git a/third_party/aws/workspace.bzl b/third_party/aws/workspace.bzl index dae7a9c9264..b54a301b3c3 100644 --- a/third_party/aws/workspace.bzl +++ b/third_party/aws/workspace.bzl @@ -16,19 +16,19 @@ def repo(): strip_prefix = "aws-sdk-cpp-1.7.266", build_file = "//third_party/aws:BUILD.bazel", ) - - third_party_http_archive( + + third_party_http_archive( name = "aws-c-common", urls = [ "http://mirror.tensorflow.org/github.com/awslabs/aws-c-common/archive/v0.4.29.tar.gz", - "https://github.com/awslabs/aws-c-common/archive/v0.4.29.tar.gz" + "https://github.com/awslabs/aws-c-common/archive/v0.4.29.tar.gz", ], sha256 = "01c2a58553a37b3aa5914d9e0bf7bf14507ff4937bc5872a678892ca20fcae1f", strip_prefix = "aws-c-common-0.4.29", build_file = "//third_party/aws:aws-c-common.bazel", ) - - third_party_http_archive( + + third_party_http_archive( name = "aws-c-event-stream", urls = [ "https://mirror.tensorflow.org/github.com/awslabs/aws-c-event-stream/archive/v0.1.4.tar.gz", @@ -38,8 +38,8 @@ def repo(): strip_prefix = "aws-c-event-stream-0.1.4", build_file = "//third_party/aws:aws-c-event-stream.bazel", ) - - third_party_http_archive( + + third_party_http_archive( name = "aws-checksums", urls = [ "https://mirror.tensorflow.org/github.com/awslabs/aws-checksums/archive/v0.1.5.tar.gz", @@ -49,4 +49,3 @@ def repo(): strip_prefix = "aws-checksums-0.1.5", build_file = "//third_party/aws:aws-checksums.bazel", ) - From dd84af21e203a333e2f9bcf7be6ddf645007fbcf Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Fri, 7 Feb 2020 03:31:46 +0000 Subject: [PATCH 049/442] Add headers to hdrs section of bazel build file ' --- third_party/aws/aws-c-common.bazel | 12 +++++++----- third_party/aws/aws-c-event-stream.bazel | 6 +++--- third_party/aws/aws-checksums.bazel | 6 +++--- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/third_party/aws/aws-c-common.bazel b/third_party/aws/aws-c-common.bazel index edfcbd78394..bc582157141 100644 --- a/third_party/aws/aws-c-common.bazel +++ b/third_party/aws/aws-c-common.bazel @@ -14,6 +14,7 @@ cc_library( srcs = select({ "@org_tensorflow//tensorflow:linux_aarch64": glob([ "source/posix/*.c", + "source/arch/*.c" ]), "@org_tensorflow//tensorflow:linux_x86_64": glob([ "source/posix/*.c", @@ -29,13 +30,14 @@ cc_library( ]), "//conditions:default": [], }) + glob([ - "source/**/*.c", - "include/**/*.h", - "include/**/*.inl", + "source/*.c", ]), hdrs = [ - "include/aws/common/config.h", - ], + "include/aws/common/config.h" + ] + glob([ + "include/**/*.h", + "include/aws/common/**/*.inl" + ]), includes = [ "include/", ], diff --git a/third_party/aws/aws-c-event-stream.bazel b/third_party/aws/aws-c-event-stream.bazel index 956670c8d28..e2a04ba6fa2 100644 --- 
a/third_party/aws/aws-c-event-stream.bazel +++ b/third_party/aws/aws-c-event-stream.bazel @@ -11,10 +11,10 @@ cc_library( name = "aws-c-event-stream", srcs = glob([ "source/*.c", - "include/**/*.h", ]), - hdrs = [ - ], + hdrs = glob([ + "include/**/*.h" + ]), includes = [ "include/", ], diff --git a/third_party/aws/aws-checksums.bazel b/third_party/aws/aws-checksums.bazel index 0af7c8cd4cf..e4067dbf5b8 100644 --- a/third_party/aws/aws-checksums.bazel +++ b/third_party/aws/aws-checksums.bazel @@ -12,10 +12,10 @@ cc_library( srcs = glob([ "source/intel/*.c", "source/*.c", - "include/**/*.h", ]), - hdrs = [ - ], + hdrs = glob([ + "include/**/*.h" + ]), includes = [ "include/", ], From 8973baaa12c464d0c20d5ad98dc950a38dd349f0 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Fri, 7 Feb 2020 03:32:55 +0000 Subject: [PATCH 050/442] Remove commented stuff --- third_party/aws/aws-checksums.bazel | 9 --------- 1 file changed, 9 deletions(-) diff --git a/third_party/aws/aws-checksums.bazel b/third_party/aws/aws-checksums.bazel index e4067dbf5b8..5aa175795b8 100644 --- a/third_party/aws/aws-checksums.bazel +++ b/third_party/aws/aws-checksums.bazel @@ -22,13 +22,4 @@ cc_library( deps = [ "@aws-c-common", ], - copts = [ - #"-std=c99", - #"-D_POSIX_C_SOURCE=199309L", - #"-D_GNU_SOURCE", - ], - linkopts = [ - #"-lrt", - #"-pthread", - ], ) From d34dfe6bafb0b51a8e6e8278ca36b5b2800c2ad9 Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Fri, 7 Feb 2020 03:47:49 +0000 Subject: [PATCH 051/442] Remove unncessary build options --- third_party/aws/aws-c-common.bazel | 9 --------- third_party/aws/aws-c-event-stream.bazel | 9 --------- 2 files changed, 18 deletions(-) diff --git a/third_party/aws/aws-c-common.bazel b/third_party/aws/aws-c-common.bazel index bc582157141..ff58c9125a2 100644 --- a/third_party/aws/aws-c-common.bazel +++ b/third_party/aws/aws-c-common.bazel @@ -43,15 +43,6 @@ cc_library( ], deps = [ ], - copts = [ - "-std=c99", - "-D_POSIX_C_SOURCE=199309L", - "-D_GNU_SOURCE", - ], - linkopts = [ - "-lrt", - "-pthread", - ], ) template_rule( diff --git a/third_party/aws/aws-c-event-stream.bazel b/third_party/aws/aws-c-event-stream.bazel index e2a04ba6fa2..b43e63f2a98 100644 --- a/third_party/aws/aws-c-event-stream.bazel +++ b/third_party/aws/aws-c-event-stream.bazel @@ -22,13 +22,4 @@ cc_library( "@aws-c-common", "@aws-checksums", ], - copts = [ - "-std=c99", - "-D_POSIX_C_SOURCE=199309L", - "-D_GNU_SOURCE", - ], - linkopts = [ - "-lrt", - "-pthread", - ], ) From 42b80a5229a01394c23b8aac32aa2f345044f640 Mon Sep 17 00:00:00 2001 From: exfalso <0slemi0@gmail.com> Date: Fri, 7 Feb 2020 12:37:02 +0100 Subject: [PATCH 052/442] micro: return error when allocation fails in MicroAllocator::Init. Fixes #36533 --- tensorflow/lite/micro/micro_allocator.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/lite/micro/micro_allocator.cc b/tensorflow/lite/micro/micro_allocator.cc index 60417b1547d..08078702f77 100644 --- a/tensorflow/lite/micro/micro_allocator.cc +++ b/tensorflow/lite/micro/micro_allocator.cc @@ -344,6 +344,7 @@ TfLiteStatus MicroAllocator::Init() { error_reporter_->Report( "Failed to allocate memory for context->tensors, %d bytes required", sizeof(TfLiteTensor) * context_->tensors_size); + return kTfLiteError; } // Initialize runtime tensors in context_ using the flatbuffer. 
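
The added `return kTfLiteError;` is what lets callers of `MicroAllocator::Init()` see the failure instead of continuing with an unallocated `context->tensors` array. A hedged caller-side fragment (the actual call site inside TF Lite Micro is not shown in this patch and may be wired differently):

```cpp
// Assumed calling pattern, for illustration only.
TfLiteStatus init_status = allocator.Init();
if (init_status != kTfLiteOk) {
  // Before this change, Init() reported the failed allocation but fell
  // through, so later code could touch the unallocated tensors array.
  return init_status;
}
```
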
From 40442b4d718d295bd06dfbcafb716c791aecc61b Mon Sep 17 00:00:00 2001 From: Puneeth K Date: Fri, 7 Feb 2020 22:06:34 +0530 Subject: [PATCH 053/442] Updated util.py --- tensorflow/python/util/nest.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py index 008685b0d32..5a929149dff 100644 --- a/tensorflow/python/util/nest.py +++ b/tensorflow/python/util/nest.py @@ -137,7 +137,17 @@ def _sequence_like(instance, args): Returns: `args` with the type of `instance`. """ - if _is_mapping(instance): + if _is_mutable_mapping(instance): + result = dict(zip(_sorted(instance), args)) + instance_type = type(instance) + if instance_type == _collections.OrderedDict: + d = _collections.OrderedDict(instance.default_factory) + for key in instance: + d[key] = result[key] + return d + else: + return instance_type((key, result[key]) for key in instance) + elif _is_mapping(instance): # Pack dictionaries in a deterministic order by sorting the keys. # Notice this means that we ignore the original order of `OrderedDict` # instances. This is intentional, to avoid potential bugs caused by mixing @@ -169,9 +179,6 @@ def _sequence_like(instance, args): # Pack a CompositeTensor's components according to a TypeSpec. assert len(args) == 1 return instance._from_components(args[0]) # pylint: disable=protected-access - # elif _is_mutable_mapping(instance): - # new_mapping = instance_type(instance) - # new_mapping.update() elif isinstance(instance, _six.moves.range): return _sequence_like(list(instance), args) elif isinstance(instance, _wrapt.ObjectProxy): From 93569cb564562739c4283c04cc8f2450bc072994 Mon Sep 17 00:00:00 2001 From: 372046933 <372046933@users.noreply.github.com> Date: Sat, 8 Feb 2020 14:58:24 +0800 Subject: [PATCH 054/442] Update nn_impl.py --- tensorflow/python/ops/nn_impl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py index 99827a5cfb2..f4b1caa809f 100644 --- a/tensorflow/python/ops/nn_impl.py +++ b/tensorflow/python/ops/nn_impl.py @@ -458,7 +458,7 @@ def scale_regularization_loss(regularization_loss): global_batch_size=GLOBAL_BATCH_SIZE) # Add scaled regularization losses. - loss += tf.scale_regularization_loss(tf.nn.l2_loss(weights)) + loss += tf.nn.scale_regularization_loss(tf.nn.l2_loss(weights)) return loss ``` From 4c73481e6bb0762452940878712f78e0a7cb39c6 Mon Sep 17 00:00:00 2001 From: Puneeth K Date: Sat, 8 Feb 2020 18:11:42 +0530 Subject: [PATCH 055/442] Modified _sequence_like --- tensorflow/python/util/nest.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py index 5a929149dff..b1fdb76c4c7 100644 --- a/tensorflow/python/util/nest.py +++ b/tensorflow/python/util/nest.py @@ -138,21 +138,19 @@ def _sequence_like(instance, args): `args` with the type of `instance`. """ if _is_mutable_mapping(instance): - result = dict(zip(_sorted(instance), args)) - instance_type = type(instance) - if instance_type == _collections.OrderedDict: - d = _collections.OrderedDict(instance.default_factory) - for key in instance: - d[key] = result[key] - return d - else: - return instance_type((key, result[key]) for key in instance) - elif _is_mapping(instance): # Pack dictionaries in a deterministic order by sorting the keys. # Notice this means that we ignore the original order of `OrderedDict` # instances. 
This is intentional, to avoid potential bugs caused by mixing # ordered and plain dicts (e.g., flattening a dict but using a # corresponding `OrderedDict` to pack it back). + result = dict(zip(_sorted(instance), args)) + instance_type = type(instance) + if instance_type == _collections.defaultdict: + d = instance_type() + for key in instance: + d[key] = result[key] + return d + elif _is_mapping(instance): result = dict(zip(_sorted(instance), args)) instance_type = type(instance) if instance_type == _collections.defaultdict: From 052408c4ad7898a6d35511689f0b8339a201ebee Mon Sep 17 00:00:00 2001 From: Puneeth K Date: Sat, 8 Feb 2020 19:50:23 +0530 Subject: [PATCH 056/442] Added mutable mapping support --- tensorflow/python/util/nest.py | 2 +- tensorflow/python/util/util.cc | 5 ++++- tensorflow/python/util/util.h | 9 +++++++++ tensorflow/python/util/util_wrapper.cc | 4 ++-- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py index b1fdb76c4c7..fa6f9a209c2 100644 --- a/tensorflow/python/util/nest.py +++ b/tensorflow/python/util/nest.py @@ -122,7 +122,7 @@ _is_mapping_view = _pywrap_utils.IsMappingView _is_attrs = _pywrap_utils.IsAttrs _is_composite_tensor = _pywrap_utils.IsCompositeTensor _is_type_spec = _pywrap_utils.IsTypeSpec -_is_mutable_mapping = _pywrap_utils.IsNestCompatibleMapping +_is_mutable_mapping = _pywrap_utils.IsMutableMapping def _sequence_like(instance, args): diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc index aa02b33e4c8..cc163898d28 100644 --- a/tensorflow/python/util/util.cc +++ b/tensorflow/python/util/util.cc @@ -221,7 +221,10 @@ int IsMappingHelper(PyObject* o) { return check_cache->CachedLookup(o); } -int IsNestCompatibleMappingHelper(PyObject* o) { +// Returns 1 if `o` is considered a mutable mapping for the purposes of Flatten(). +// Returns 0 otherwise. +// Returns -1 if an error occurred. +int IsMutableMappingHelper(PyObject* o) { static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) { return IsInstanceOfRegisteredType(to_check, "MutableMapping"); }); diff --git a/tensorflow/python/util/util.h b/tensorflow/python/util/util.h index 7cd4b0cb495..0f08c729d7e 100644 --- a/tensorflow/python/util/util.h +++ b/tensorflow/python/util/util.h @@ -86,6 +86,15 @@ PyObject* IsNamedtuple(PyObject* o, bool strict); // True if the sequence subclasses mapping. bool IsMapping(PyObject* o); +// Returns a true if its input is a collections.MutableMapping. +// +// Args: +// seq: the input to be checked. +// +// Returns: +// True if the sequence subclasses mapping. +bool IsMutableMapping(PyObject* o); + // Returns a true if its input is a (possibly wrapped) tuple. // // Args: diff --git a/tensorflow/python/util/util_wrapper.cc b/tensorflow/python/util/util_wrapper.cc index c5085cd99ef..1d4274de7c0 100644 --- a/tensorflow/python/util/util_wrapper.cc +++ b/tensorflow/python/util/util_wrapper.cc @@ -141,9 +141,9 @@ PYBIND11_MODULE(_pywrap_utils, m) { True if `instance` is a `collections.Mapping`. 
)pbdoc"); m.def( - "IsNestCompatibleMapping", + "IsMutableMapping", [](const py::handle& o) { - bool result = tensorflow::swig::IsNestCompatibleMapping(o.ptr()); + bool result = tensorflow::swig::IsMutableMapping(o.ptr()); if (PyErr_Occurred()) { throw py::error_already_set(); } From 92b77b5329d1dccc12e4cc4b759da8bfe1ba7315 Mon Sep 17 00:00:00 2001 From: Phil Pearl Date: Sat, 8 Feb 2020 15:47:28 +0000 Subject: [PATCH 057/442] Extend Go benchmarks --- tensorflow/go/tensor_test.go | 62 ++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/tensorflow/go/tensor_test.go b/tensorflow/go/tensor_test.go index dc533cd3e1c..ece34a4dd54 100644 --- a/tensorflow/go/tensor_test.go +++ b/tensorflow/go/tensor_test.go @@ -18,6 +18,7 @@ package tensorflow import ( "bytes" + "fmt" "io" "reflect" "testing" @@ -276,6 +277,7 @@ func TestReadTensorReadAll(t *testing.T) { } func benchmarkNewTensor(b *testing.B, v interface{}) { + b.ReportAllocs() for i := 0; i < b.N; i++ { if t, err := NewTensor(v); err != nil || t == nil { b.Fatalf("(%v, %v)", t, err) @@ -283,32 +285,50 @@ func benchmarkNewTensor(b *testing.B, v interface{}) { } } -func BenchmarkNewTensor(b *testing.B) { - var ( - // Some sample sizes from the Inception image labeling model. - // Where input tensors correspond to a 224x224 RGB image - // flattened into a vector. - vector [224 * 224 * 3]int32 - ) - b.Run("[150528]", func(b *testing.B) { benchmarkNewTensor(b, vector) }) -} +func benchmarkValueTensor(b *testing.B, v interface{}) { + t, err := NewTensor(v) + if err != nil { + b.Fatalf("(%v, %v)", t, err) + } + b.ReportAllocs() + b.ResetTimer() -func benchmarkDecodeTensor(b *testing.B, t *Tensor) { for i := 0; i < b.N; i++ { _ = t.Value() } } -func BenchmarkDecodeTensor(b *testing.B) { - var ( - // Some sample sizes from the Inception image labeling model. - // Where input tensors correspond to a 224x224 RGB image - // flattened into a vector. - vector [224 * 224 * 3]int32 - ) - t, err := NewTensor(vector) - if err != nil { - b.Fatalf("(%v, %v)", t, err) +func BenchmarkTensor(b *testing.B) { + // Some sample sizes from the Inception image labeling model. + // Where input tensors correspond to a 224x224 RGB image + // flattened into a vector. 
+ var vector [224 * 224 * 3]int32 + + l3 := make([][][]float32, 100) + l2 := make([][]float32, 100*100) + l1 := make([]float32, 100*100*100) + for i := range l2 { + l2[i] = l1[i*100 : (i+1)*100] } - b.Run("[150528]", func(b *testing.B) { benchmarkDecodeTensor(b, t) }) + for i := range l3 { + l3[i] = l2[i*100 : (i+1)*100] + } + + tests := []interface{}{ + vector, + l1, + l2, + l3, + } + b.Run("New", func(b *testing.B) { + for _, test := range tests { + b.Run(fmt.Sprintf("%T", test), func(b *testing.B) { benchmarkNewTensor(b, test) }) + } + }) + b.Run("Value", func(b *testing.B) { + for _, test := range tests { + b.Run(fmt.Sprintf("%T", test), func(b *testing.B) { benchmarkValueTensor(b, test) }) + } + }) + } From af6ec41ef315f841a91ccca97dfa7ebe3cd0ca82 Mon Sep 17 00:00:00 2001 From: Phil Pearl Date: Sat, 8 Feb 2020 17:01:47 +0000 Subject: [PATCH 058/442] Go: NewTensor peformance improvement Avoid binary.Write for slices and arrays --- tensorflow/go/tensor.go | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/tensorflow/go/tensor.go b/tensorflow/go/tensor.go index 9bc643ae6d2..b6c4237601a 100644 --- a/tensorflow/go/tensor.go +++ b/tensorflow/go/tensor.go @@ -329,11 +329,19 @@ func encodeTensor(w *bytes.Buffer, v reflect.Value, shape []int64) error { } } - // Optimisation: if only one dimension is left we can use binary.Write() directly for this slice + // Optimisation: if only one dimension is left we can write the full + // slice or array in one go. if len(shape) == 1 && v.Len() > 0 { switch v.Index(0).Kind() { - case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128: - return binary.Write(w, nativeEndian, v.Interface()) + case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128, reflect.Bool: + elt := v.Index(0) + if !elt.CanAddr() { + // Very frustrating that Go won't give us an address at this + // point. + return binary.Write(w, nativeEndian, v.Interface()) + } + ptr := unsafe.Pointer(elt.Addr().Pointer()) + return copyPtr(w, ptr, v.Len()*int(elt.Type().Size())) } } @@ -351,6 +359,30 @@ func encodeTensor(w *bytes.Buffer, v reflect.Value, shape []int64) error { return nil } +// sliceHeader is a safer version of reflect.SliceHeader. Using unsafe.Pointer +// for Data reduces potential issues with the GC. The reflect package uses a +// similar struct internally. +type sliceHeader struct { + Data unsafe.Pointer + Len int + Cap int +} + +// copyPtr copies the backing data for a slice or array directly into w. Note +// we don't need to worry about byte ordering because we want the natural byte +// order for the machine we're running on. +func copyPtr(w *bytes.Buffer, ptr unsafe.Pointer, l int) error { + h := sliceHeader{ + Data: ptr, + Len: l, + Cap: l, + } + // Convert our slice header into a []byte so we can call w.Write + b := *(*[]byte)(unsafe.Pointer(&h)) + _, err := w.Write(b) + return err +} + // decodeTensor decodes the Tensor from the buffer to ptr using the format // specified in c_api.h. Use stringDecoder for String tensors. 
func decodeTensor(r *bytes.Reader, shape []int64, typ reflect.Type, ptr reflect.Value) error { From 22295c2991245b452d169c71f02f98d68675ada8 Mon Sep 17 00:00:00 2001 From: Phil Pearl Date: Sun, 9 Feb 2020 09:27:29 +0000 Subject: [PATCH 059/442] go: Improve NewTensor for primitive and array types Apply performance improvements to arrays. Tidy up code. --- tensorflow/go/tensor.go | 132 +++++++++++++++++++++-------------- tensorflow/go/tensor_test.go | 2 + 2 files changed, 82 insertions(+), 52 deletions(-) diff --git a/tensorflow/go/tensor.go b/tensorflow/go/tensor.go index b6c4237601a..c84a8732a63 100644 --- a/tensorflow/go/tensor.go +++ b/tensorflow/go/tensor.go @@ -94,9 +94,22 @@ func NewTensor(value interface{}) (*Tensor, error) { raw := tensorData(t.c) buf := bytes.NewBuffer(raw[:0:len(raw)]) if dataType != String { - if err := encodeTensor(buf, val, shape); err != nil { - return nil, err + if isAllArray(val.Type()) { + // We have arrays all the way down, or just primitive types. We can + // just copy the memory in as it is all contiguous. + if err := copyPtr(buf, unpackEFace(value).data, int(val.Type().Size())); err != nil { + return nil, err + } + } else { + // When there are slices involved the memory for each leaf slice may + // not be contiguous with the others or in the order we might + // expect, so we need to work our way down to each slice of + // primitives and copy them individually + if err := encodeTensorWithSlices(buf, val, shape); err != nil { + return nil, err + } } + if uintptr(buf.Len()) != nbytes { return nil, bug("NewTensor incorrectly calculated the size of a tensor with type %v and shape %v as %v bytes instead of %v", dataType, shape, nbytes, buf.Len()) } @@ -112,6 +125,43 @@ func NewTensor(value interface{}) (*Tensor, error) { return t, nil } +// isAllArray returns true if type is a primitive type or an array of primitive +// types or an array of ... etc.. When this is true the data we want is +// contiguous in RAM. +func isAllArray(typ reflect.Type) bool { + switch typ.Kind() { + case reflect.Slice: + return false + case reflect.Array: + return isAllArray(typ.Elem()) + default: + // We know the type is slices/arrays of slices/arrays of primitive types. + return true + } +} + +// eface defines what an interface type actually is: a pointer to type +// information about the encapsulated type and a pointer to the encapsulated +// value. +type eface struct { + rtype unsafe.Pointer + data unsafe.Pointer +} + +// unpackEFace gives us an effient way to get us a pointer to the value carried +// in an interface. If you wrap a pointer type in an interface then the pointer +// is directly stored in the interface struct. If you wrap a value type in an +// interface then the compiler copies the value into a newly allocated piece of +// memory and stores a pointer to that memory in the interface. So we're +// guaranteed to get a pointer. Go reflection doesn't expose the pointer to +// value types straightforwardly as it doesn't want you to think you have a +// reference to the original value. But we just want a pointer to make it +// efficient to read the value, so cheating like this should be safe and +// reasonable. +func unpackEFace(obj interface{}) *eface { + return (*eface)(unsafe.Pointer(&obj)) +} + // ReadTensor constructs a Tensor with the provided type and shape from the // serialized tensor contents in r. 
// @@ -302,60 +352,38 @@ func byteSizeOfEncodedStrings(val interface{}) uintptr { return size } -// encodeTensor writes v to the specified buffer using the format specified in +// encodeTensorWithSlices writes v to the specified buffer using the format specified in // c_api.h. Use stringEncoder for String tensors. -func encodeTensor(w *bytes.Buffer, v reflect.Value, shape []int64) error { - switch v.Kind() { - case reflect.Bool: - b := byte(0) - if v.Bool() { - b = 1 +func encodeTensorWithSlices(w *bytes.Buffer, v reflect.Value, shape []int64) error { + // If current dimension is a slice, verify that it has the expected size + // Go's type system makes that guarantee for arrays. + if v.Kind() == reflect.Slice { + expected := int(shape[0]) + if v.Len() != expected { + return fmt.Errorf("mismatched slice lengths: %d and %d", v.Len(), expected) } - if err := w.WriteByte(b); err != nil { - return err - } - case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128: - if err := binary.Write(w, nativeEndian, v.Interface()); err != nil { - return err - } - - case reflect.Array, reflect.Slice: - // If current dimension is a slice, verify that it has the expected size - // Go's type system makes that guarantee for arrays. - if v.Kind() == reflect.Slice { - expected := int(shape[0]) - if v.Len() != expected { - return fmt.Errorf("mismatched slice lengths: %d and %d", v.Len(), expected) - } - } - - // Optimisation: if only one dimension is left we can write the full - // slice or array in one go. - if len(shape) == 1 && v.Len() > 0 { - switch v.Index(0).Kind() { - case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128, reflect.Bool: - elt := v.Index(0) - if !elt.CanAddr() { - // Very frustrating that Go won't give us an address at this - // point. - return binary.Write(w, nativeEndian, v.Interface()) - } - ptr := unsafe.Pointer(elt.Addr().Pointer()) - return copyPtr(w, ptr, v.Len()*int(elt.Type().Size())) - } - } - - subShape := shape[1:] - for i := 0; i < v.Len(); i++ { - err := encodeTensor(w, v.Index(i), subShape) - if err != nil { - return err - } - } - - default: + } else if v.Kind() != reflect.Array { return fmt.Errorf("unsupported type %v", v.Type()) } + + // Once we have just a single dimension we can just copy the data + if len(shape) == 1 && v.Len() > 0 { + elt := v.Index(0) + if !elt.CanAddr() { + panic("cannot take address") + } + ptr := unsafe.Pointer(elt.Addr().Pointer()) + return copyPtr(w, ptr, v.Len()*int(elt.Type().Size())) + } + + subShape := shape[1:] + for i := 0; i < v.Len(); i++ { + err := encodeTensorWithSlices(w, v.Index(i), subShape) + if err != nil { + return err + } + } + return nil } diff --git a/tensorflow/go/tensor_test.go b/tensorflow/go/tensor_test.go index ece34a4dd54..4d2df3a97dd 100644 --- a/tensorflow/go/tensor_test.go +++ b/tensorflow/go/tensor_test.go @@ -303,6 +303,7 @@ func BenchmarkTensor(b *testing.B) { // Where input tensors correspond to a 224x224 RGB image // flattened into a vector. 
var vector [224 * 224 * 3]int32 + var arrays [100][100][100]int32 l3 := make([][][]float32, 100) l2 := make([][]float32, 100*100) @@ -316,6 +317,7 @@ func BenchmarkTensor(b *testing.B) { tests := []interface{}{ vector, + arrays, l1, l2, l3, From 147f27254118b7614a667b02e56378654fbda213 Mon Sep 17 00:00:00 2001 From: Phil Pearl Date: Sun, 9 Feb 2020 11:54:55 +0000 Subject: [PATCH 060/442] go: Improve perf of Value for non-string Tensors --- tensorflow/go/tensor.go | 146 ++++++++++++++++++++++++---------------- 1 file changed, 87 insertions(+), 59 deletions(-) diff --git a/tensorflow/go/tensor.go b/tensorflow/go/tensor.go index c84a8732a63..0ce080d8bd5 100644 --- a/tensorflow/go/tensor.go +++ b/tensorflow/go/tensor.go @@ -218,23 +218,90 @@ func (t *Tensor) Shape() []int64 { return t.shape } // Tensor(int64, 0): int64 // Tensor(float64, 3): [][][]float64 func (t *Tensor) Value() interface{} { - typ := typeOf(t.DataType(), t.Shape()) - val := reflect.New(typ) raw := tensorData(t.c) - if t.DataType() != String { - if err := decodeTensor(bytes.NewReader(raw), t.Shape(), typ, val); err != nil { - panic(bug("unable to decode Tensor of type %v and shape %v - %v", t.DataType(), t.Shape(), err)) - } - } else { - nflattened := numElements(t.Shape()) - d := stringDecoder{offsets: bytes.NewReader(raw[0 : 8*nflattened]), data: raw[8*nflattened:], status: newStatus()} - if err := d.decode(val, t.Shape()); err != nil { - panic(bug("unable to decode String tensor with shape %v - %v", t.Shape(), err)) - } + shape := t.Shape() + dt := t.DataType() + if dt != String { + return decodeTensor(raw, shape, dt).Interface() + } + + typ := typeOf(dt, shape) + val := reflect.New(typ) + nflattened := numElements(shape) + d := stringDecoder{offsets: bytes.NewReader(raw[0 : 8*nflattened]), data: raw[8*nflattened:], status: newStatus()} + if err := d.decode(val, shape); err != nil { + panic(bug("unable to decode String tensor with shape %v - %v", shape, err)) } return reflect.Indirect(val).Interface() } +func decodeTensor(raw []byte, shape []int64, dt DataType) reflect.Value { + typ := typeForDataType(dt) + // Create a 1-dimensional slice of the base large enough for the data and + // copy the data in. + n := int(numElements(shape)) + l := n * int(typ.Size()) + typ = reflect.SliceOf(typ) + slice := reflect.MakeSlice(typ, n, n) + h := sliceHeader{ + Data: unsafe.Pointer(slice.Pointer()), + Len: l, + Cap: l, + } + baseBytes := *(*[]byte)(unsafe.Pointer(&h)) + copy(baseBytes, raw) + // Now we have the data in place in the base slice we can add the + // dimensions. We want to walk backwards through the shape. If the shape is + // length 1 or 0 then we're already done. + if len(shape) == 0 { + return slice.Index(0) + } + if len(shape) == 1 { + return slice + } + // We have a special case if the tensor has no data. Our backing slice is + // empty, but we still want to create slices following the shape. In this + // case only the final part of the shape will be 0 and we want to recalculate + // n at this point ignoring that 0. + // For example if our shape is 3 * 2 * 0 then n will be zero, but we still + // want 6 zero length slices to group as follows. 
+ // {{} {}} {{} {}} {{} {}} + if n == 0 { + n = int(numElements(shape[:len(shape)-1])) + } + for i := len(shape) - 2; i >= 0; i-- { + underlyingSize := typ.Elem().Size() + typ = reflect.SliceOf(typ) + subsliceLen := int(shape[i+1]) + if subsliceLen != 0 { + n = n / subsliceLen + } + // Just using reflection it is difficult to avoid unnecessary + // allocations while setting up the sub-slices as the Slice function on + // a slice Value allocates. So we end up doing pointer arithmetic! + // Pointer() on a slice gives us access to the data backing the slice. + // We insert slice headers directly into this data. + data := slice.Pointer() + nextSlice := reflect.MakeSlice(typ, n, n) + nextData := nextSlice.Pointer() + const sliceSize = unsafe.Sizeof(sliceHeader{}) + for j := 0; j < n; j++ { + // This is equivalent to h := slice[j*subsliceLen: (j+1)*subsliceLen] + h := sliceHeader{ + Data: unsafe.Pointer(data + (uintptr(j*subsliceLen) * underlyingSize)), + Len: subsliceLen, + Cap: subsliceLen, + } + + // This is equivalent to nSlice[j] = h + *(*sliceHeader)(unsafe.Pointer(nextData + (uintptr(j) * sliceSize))) = h + } + + slice = nextSlice + } + return slice +} + // WriteContentsTo writes the serialized contents of t to w. // // Returns the number of bytes written. See ReadTensor for @@ -311,18 +378,18 @@ func shapeAndDataTypeOf(val reflect.Value) (shape []int64, dt DataType, err erro return shape, dt, fmt.Errorf("unsupported type %v", typ) } -// typeOf converts from a DataType and Shape to the equivalent Go type. -func typeOf(dt DataType, shape []int64) reflect.Type { - var ret reflect.Type +func typeForDataType(dt DataType) reflect.Type { for _, t := range types { if dt == DataType(t.dataType) { - ret = t.typ - break + return t.typ } } - if ret == nil { - panic(bug("DataType %v is not supported (see https://www.tensorflow.org/code/tensorflow/core/framework/types.proto)", dt)) - } + panic(bug("DataType %v is not supported (see https://www.tensorflow.org/code/tensorflow/core/framework/types.proto)", dt)) +} + +// typeOf converts from a DataType and Shape to the equivalent Go type. +func typeOf(dt DataType, shape []int64) reflect.Type { + ret := typeForDataType(dt) for range shape { ret = reflect.SliceOf(ret) } @@ -411,45 +478,6 @@ func copyPtr(w *bytes.Buffer, ptr unsafe.Pointer, l int) error { return err } -// decodeTensor decodes the Tensor from the buffer to ptr using the format -// specified in c_api.h. Use stringDecoder for String tensors. 
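A minimal sketch of the slice-header technique the new decodeTensor uses: allocate one contiguous backing slice, then materialize the outer dimensions by writing slice headers that point into it, avoiding a reflect allocation per sub-slice. The sliceHeader type below mirrors the helper the patch assumes; the 2x3 shape is only for illustration.

```go
package main

import (
	"fmt"
	"unsafe"
)

// sliceHeader matches the in-memory layout of a Go slice, with the data
// pointer held as an unsafe.Pointer.
type sliceHeader struct {
	Data unsafe.Pointer
	Len  int
	Cap  int
}

func main() {
	// One contiguous backing slice holding a 2x3 tensor, row-major.
	base := []float32{1, 2, 3, 4, 5, 6}

	// Build the outer [][]float32 by pointing slice headers into the backing
	// data instead of allocating each row separately.
	rows := make([][]float32, 2)
	for j := 0; j < 2; j++ {
		h := sliceHeader{
			Data: unsafe.Pointer(&base[j*3]),
			Len:  3,
			Cap:  3,
		}
		rows[j] = *(*[]float32)(unsafe.Pointer(&h))
	}
	fmt.Println(rows) // [[1 2 3] [4 5 6]]
}
```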
-func decodeTensor(r *bytes.Reader, shape []int64, typ reflect.Type, ptr reflect.Value) error { - switch typ.Kind() { - case reflect.Bool: - b, err := r.ReadByte() - if err != nil { - return err - } - ptr.Elem().SetBool(b == 1) - case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128: - if err := binary.Read(r, nativeEndian, ptr.Interface()); err != nil { - return err - } - - case reflect.Slice: - val := reflect.Indirect(ptr) - val.Set(reflect.MakeSlice(typ, int(shape[0]), int(shape[0]))) - - // Optimization: if only one dimension is left we can use binary.Read() directly for this slice - if len(shape) == 1 && val.Len() > 0 { - switch val.Index(0).Kind() { - case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128: - return binary.Read(r, nativeEndian, val.Interface()) - } - } - - for i := 0; i < val.Len(); i++ { - if err := decodeTensor(r, shape[1:], typ.Elem(), val.Index(i).Addr()); err != nil { - return err - } - } - - default: - return fmt.Errorf("unsupported type %v", typ) - } - return nil -} - type stringEncoder struct { offsets io.Writer data []byte From fde6c13e2d31b6feec5a4eaf63a765edde9fd820 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= Date: Mon, 10 Feb 2020 10:31:45 +0100 Subject: [PATCH 061/442] TFLu: Update stm32f4 target Filter out failed test and increase RAM size. --- .../micro/tools/make/targets/stm32f4/stm32f4.lds | 4 ++-- .../micro/tools/make/targets/stm32f4_makefile.inc | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/micro/tools/make/targets/stm32f4/stm32f4.lds b/tensorflow/lite/micro/tools/make/targets/stm32f4/stm32f4.lds index 6ecde0000b2..8e8b3f75448 100644 --- a/tensorflow/lite/micro/tools/make/targets/stm32f4/stm32f4.lds +++ b/tensorflow/lite/micro/tools/make/targets/stm32f4/stm32f4.lds @@ -30,9 +30,9 @@ limitations under the License. 
/* Define main entry point */ ENTRY(_main) -/* 20K of RAM and 128K of FLASH */ +/* 32K of RAM and 256K of FLASH */ MEMORY { -RAM (xrw) : ORIGIN = 0x20000000, LENGTH = 20K +RAM (xrw) : ORIGIN = 0x20000000, LENGTH = 32K FLASH (rx) : ORIGIN = 0x8000000, LENGTH = 256K } diff --git a/tensorflow/lite/micro/tools/make/targets/stm32f4_makefile.inc b/tensorflow/lite/micro/tools/make/targets/stm32f4_makefile.inc index 4df3e755934..f9451cc6db3 100644 --- a/tensorflow/lite/micro/tools/make/targets/stm32f4_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/stm32f4_makefile.inc @@ -62,6 +62,21 @@ ifeq ($(TARGET), stm32f4) tensorflow/lite/micro/micro_allocator_test.cc \ tensorflow/lite/micro/memory_helpers_test.cc \ tensorflow/lite/micro/kernels/depthwise_conv_test.cc \ + tensorflow/lite/micro/kernels/logistic_test.cc \ + tensorflow/lite/micro/kernels/logical_test.cc \ + tensorflow/lite/micro/kernels/maximum_minimum_test.cc \ + tensorflow/lite/micro/kernels/comparisons_test.cc \ + tensorflow/lite/micro/kernels/reshape_test.cc \ + tensorflow/lite/micro/kernels/arg_min_max_test.cc \ + tensorflow/lite/micro/kernels/elementwise_test.cc \ + tensorflow/lite/micro/kernels/strided_slice_test.cc \ + tensorflow/lite/micro/kernels/prelu_test.cc \ + tensorflow/lite/micro/kernels/pooling_test.cc \ + tensorflow/lite/micro/kernels/pack_test.cc \ + tensorflow/lite/micro/kernels/activations_test.cc \ + tensorflow/lite/micro/kernels/dequantize_test.cc \ + tensorflow/lite/micro/kernels/unpack_test.cc \ + tensorflow/lite/micro/kernels/split_test.cc \ tensorflow/lite/micro/kernels/conv_test.cc \ tensorflow/lite/micro/simple_tensor_allocator_test.cc MICROLITE_TEST_SRCS := $(filter-out $(EXCLUDED_TESTS), $(MICROLITE_TEST_SRCS)) From 5234f66ecfcfa3835d7d86a47932838642c07c5b Mon Sep 17 00:00:00 2001 From: Puneeth K Date: Mon, 10 Feb 2020 21:20:12 +0530 Subject: [PATCH 062/442] Added support for MutableMapping --- tensorflow/python/util/nest.py | 12 ++---------- tensorflow/python/util/util.cc | 1 + tensorflow/tools/def_file_filter/symbols_pybind.txt | 1 + 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py index fa6f9a209c2..c27cb8bc2f8 100644 --- a/tensorflow/python/util/nest.py +++ b/tensorflow/python/util/nest.py @@ -117,7 +117,6 @@ def _is_namedtuple(instance, strict=False): # See the swig file (util.i) for documentation. 
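The nest changes below register collections.abc.MutableMapping alongside Mapping. A stdlib-only sketch of the distinction that registration cares about: every MutableMapping is a Mapping, but read-only mappings satisfy only the latter.

```python
import collections
import collections.abc
import types

d = collections.defaultdict(list)
print(isinstance(d, collections.abc.Mapping))          # True
print(isinstance(d, collections.abc.MutableMapping))   # True

ro = types.MappingProxyType({"a": 1})
print(isinstance(ro, collections.abc.Mapping))         # True
print(isinstance(ro, collections.abc.MutableMapping))  # False
```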
-_is_mapping = _pywrap_utils.IsMapping _is_mapping_view = _pywrap_utils.IsMappingView _is_attrs = _pywrap_utils.IsAttrs _is_composite_tensor = _pywrap_utils.IsCompositeTensor @@ -146,15 +145,7 @@ def _sequence_like(instance, args): result = dict(zip(_sorted(instance), args)) instance_type = type(instance) if instance_type == _collections.defaultdict: - d = instance_type() - for key in instance: - d[key] = result[key] - return d - elif _is_mapping(instance): - result = dict(zip(_sorted(instance), args)) - instance_type = type(instance) - if instance_type == _collections.defaultdict: - d = _collections.defaultdict(instance.default_factory) + d = instance_type(_collections.defaultdict(instance.default_factory)) for key in instance: d[key] = result[key] return d @@ -1371,6 +1362,7 @@ list_to_tuple = _list_to_tuple _pywrap_utils.RegisterType("Mapping", _collections_abc.Mapping) +_pywrap_utils.RegisterType("MutableMapping", _collections_abc.MutableMapping) _pywrap_utils.RegisterType("Sequence", _collections_abc.Sequence) _pywrap_utils.RegisterType("MappingView", _collections_abc.MappingView) _pywrap_utils.RegisterType("ObjectProxy", _wrapt.ObjectProxy) diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc index cc163898d28..daee5c66771 100644 --- a/tensorflow/python/util/util.cc +++ b/tensorflow/python/util/util.cc @@ -888,6 +888,7 @@ bool AssertSameStructureHelper( bool IsSequence(PyObject* o) { return IsSequenceHelper(o) == 1; } bool IsMapping(PyObject* o) { return IsMappingHelper(o) == 1; } +bool IsMutableMapping(PyObject* o){ return IsMutableMappingHelper(o) == 1; } bool IsMappingView(PyObject* o) { return IsMappingViewHelper(o) == 1; } bool IsAttrs(PyObject* o) { return IsAttrsHelper(o) == 1; } bool IsTensor(PyObject* o) { return IsTensorHelper(o) == 1; } diff --git a/tensorflow/tools/def_file_filter/symbols_pybind.txt b/tensorflow/tools/def_file_filter/symbols_pybind.txt index e657edc4fbf..b21c9195d76 100644 --- a/tensorflow/tools/def_file_filter/symbols_pybind.txt +++ b/tensorflow/tools/def_file_filter/symbols_pybind.txt @@ -5,6 +5,7 @@ tensorflow::swig::IsCompositeTensor tensorflow::swig::IsTypeSpec tensorflow::swig::IsNamedtuple tensorflow::swig::IsMapping +tensorflow::swig::IsMutableMapping tensorflow::swig::IsMappingView tensorflow::swig::IsAttrs tensorflow::swig::IsTensor From 66832a3986b65b41268d07b12090f0b4305db925 Mon Sep 17 00:00:00 2001 From: Lakshay Tokas Date: Mon, 10 Feb 2020 15:40:01 -0800 Subject: [PATCH 063/442] Added changes for DNN 0.9 to softmax, identity_op, and lrn ops. --- tensorflow/core/kernels/mkl_identity_op.cc | 4 +- tensorflow/core/kernels/mkl_lrn_op.cc | 259 ++++++++++----------- tensorflow/core/kernels/mkl_softmax_op.cc | 106 +++++---- 3 files changed, 190 insertions(+), 179 deletions(-) diff --git a/tensorflow/core/kernels/mkl_identity_op.cc b/tensorflow/core/kernels/mkl_identity_op.cc index a2b6617ca61..7f6c255ac88 100644 --- a/tensorflow/core/kernels/mkl_identity_op.cc +++ b/tensorflow/core/kernels/mkl_identity_op.cc @@ -16,6 +16,7 @@ limitations under the License. // See docs in ../ops/array_ops.cc. #ifdef INTEL_MKL +#include "mkldnn.hpp" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" @@ -23,8 +24,6 @@ limitations under the License. 
#include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" - -#include "mkldnn.hpp" #include "tensorflow/core/util/mkl_util.h" namespace tensorflow { @@ -64,4 +63,5 @@ TF_CALL_float(REGISTER_MKL_CPU); TF_CALL_bfloat16(REGISTER_MKL_CPU); #undef REGISTER_MKL_CPU } // namespace tensorflow + #endif // INTEL_MKL diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc index 93df6e1ae99..2b7323d12af 100644 --- a/tensorflow/core/kernels/mkl_lrn_op.cc +++ b/tensorflow/core/kernels/mkl_lrn_op.cc @@ -21,24 +21,26 @@ limitations under the License. #ifdef INTEL_MKL #define EIGEN_USE_THREADS + +#include #include #include "mkldnn.hpp" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/util/mkl_types.h" #include "tensorflow/core/util/mkl_util.h" #include "tensorflow/core/util/tensor_format.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #if !defined(IS_MOBILE_PLATFORM) #include "tensorflow/core/util/work_sharder.h" #endif -using mkldnn::lrn_across_channels; using mkldnn::lrn_backward; using mkldnn::lrn_forward; using mkldnn::prop_kind; @@ -69,14 +71,14 @@ class MklLRNOp : public OpKernel { public: ~MklLRNOp() {} - explicit MklLRNOp(OpKernelConstruction* context) : OpKernel(context) { + explicit MklLRNOp(OpKernelConstruction* context) + : OpKernel(context), cpu_engine_(ENGINE_CPU, 0) { int64 depth_radius64; OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64)); - OP_REQUIRES( - context, - FastBoundsCheck(depth_radius64, std::numeric_limits::max()), - errors::InvalidArgument("depth_radius = ", depth_radius64, - " larger than int max")); + OP_REQUIRES(context, FastBoundsCheck(depth_radius64, + std::numeric_limits::max()), + errors::InvalidArgument("depth_radius = ", depth_radius64, + " larger than int max")); depth_radius_ = static_cast(depth_radius64); OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_)); @@ -85,6 +87,7 @@ class MklLRNOp : public OpKernel { workspace_enabled_ = false; OP_REQUIRES_OK(context, context->GetAttr("workspace_enabled", &workspace_enabled_)); + fwd_stream_.reset(new CPU_STREAM(cpu_engine_)); } void Compute(OpKernelContext* context) override { @@ -92,7 +95,6 @@ class MklLRNOp : public OpKernel { SanityCheckInputs(context); if (!context->status().ok()) return; - auto cpu_engine = engine(engine::cpu, 0); const Tensor& src_tensor = MklGetInput(context, kIdxInput); MklDnnShape src_dnn_shape; GetMklShape(context, kIdxInput, &src_dnn_shape); @@ -120,9 +122,9 @@ class MklLRNOp : public OpKernel { // and we can enable the workspace workspace_enabled_ = true; - MklDnnData src_dnn_data(&cpu_engine); - MklDnnData dst_dnn_data(&cpu_engine); - MklDnnData workspace_dnn_data(&cpu_engine); + MklDnnData src_dnn_data(&cpu_engine_); + MklDnnData dst_dnn_data(&cpu_engine_); + MklDnnData workspace_dnn_data(&cpu_engine_); TensorShape tf_output_shape = src_tensor.shape(); @@ -134,39 +136,57 @@ class MklLRNOp : public OpKernel { // and MKL-DNN performs normalization over Channel, we tell MKL-DNN // that input is in NHWC layout with Channel being the last dimension. 
src_dnn_data.SetUsrMem(src_md, &src_tensor); - src_dnn_data.SetOpMemDesc(input_dims, memory::format::nhwc); + src_dnn_data.SetOpMemDesc(input_dims, MEMORY_FORMAT::nhwc); - // output_dnn_data and workspace both have the same shape as input + // dst_dnn_data has the same shape as input. dst_dnn_data.SetUsrMem(src_md); - dst_dnn_data.SetOpMemDesc(input_dims, memory::format::nhwc); + dst_dnn_data.SetOpMemDesc(input_dims, MEMORY_FORMAT::nhwc); // Create LRN primitive descriptor. // Tensorflow's normalization semantics is across channels. // MKL-DNN also supports normalization within channel. - auto lrn_desc = lrn_forward::desc(prop_kind::forward, lrn_across_channels, - src_dnn_data.GetUsrMemDesc(), - kernel_size, new_alpha, beta_, bias_); - auto lrn_prim_desc = lrn_forward::primitive_desc(lrn_desc, cpu_engine); + auto lrn_desc = lrn_forward::desc( + prop_kind::forward, ALGORITHM::lrn_across_channels, + src_dnn_data.GetUsrMemDesc(), kernel_size, new_alpha, beta_, bias_); + auto lrn_prim_desc = lrn_forward::primitive_desc(lrn_desc, cpu_engine_); // Allocate output_dnn_data tensor. Tensor* output_tensor = nullptr; - memory::format input_format = src_dnn_shape.GetTfDataFormat(); + auto input_format = src_dnn_shape.GetTfDataFormat(); AllocateOutputTensor(context, lrn_prim_desc, input_dims, input_format, &output_tensor); OP_REQUIRES_OK(context, context->status()); - CHECK_NOTNULL(output_tensor); + DCHECK(output_tensor != nullptr); dst_dnn_data.SetUsrMemDataHandle(output_tensor); // Handle workspace required for MKL-DNN. AllocateWorkspaceTensor(context, lrn_prim_desc, &workspace_dnn_data); OP_REQUIRES_OK(context, context->status()); - PrepareAndExecuteNet(lrn_prim_desc, &src_dnn_data, &dst_dnn_data, - &workspace_dnn_data); + // Check for input reorder + src_dnn_data.CheckReorderToOpMem(MEMORY_PD_WITHOUT_DATA( + lrn_prim_desc.PRIMITIVE_DESC_SRC, cpu_engine_)); + + std::vector net; +#ifdef ENABLE_MKLDNN_V1 + net.push_back(lrn_forward(lrn_prim_desc)); + std::vector> net_args; + net_args.push_back({{MKLDNN_ARG_SRC, src_dnn_data.GetOpMem()}, + {MKLDNN_ARG_WORKSPACE, workspace_dnn_data.GetOpMem()}, + { MKLDNN_ARG_DST, + dst_dnn_data.GetOpMem() }}); + net.push_back(lrn_forward(lrn_prim_desc)); + net.at(0).execute(*fwd_stream_, net_args.at(0)); +#else + net.push_back(lrn_forward(lrn_prim_desc, src_dnn_data.GetOpMem(), + workspace_dnn_data.GetOpMem(), + dst_dnn_data.GetOpMem())); + fwd_stream_->submit(net).wait(); +#endif } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + ", message: " + + string(e.message) + ", in file " + string(__FILE__) + + ":" + std::to_string(__LINE__); OP_REQUIRES_OK( context, errors::Aborted("Operation received an exception:", error_msg)); @@ -174,33 +194,13 @@ class MklLRNOp : public OpKernel { } private: - void PrepareAndExecuteNet(const lrn_forward::primitive_desc& lrn_fwd_desc, - MklDnnData* src_dnn_data, - MklDnnData* dst_dnn_data, - MklDnnData* wksp_dnn_data = nullptr) { - // Check for input reorder - src_dnn_data->CheckReorderToOpMem(lrn_fwd_desc.src_primitive_desc()); - - // Create pooling primitive and add it to net - std::vector net; - if (wksp_dnn_data != nullptr) { - net.push_back(lrn_forward(lrn_fwd_desc, src_dnn_data->GetOpMem(), - wksp_dnn_data->GetOpMem(), - dst_dnn_data->GetOpMem())); - } else { - net.push_back(lrn_forward(lrn_fwd_desc, src_dnn_data->GetOpMem(), - 
dst_dnn_data->GetOpMem())); - } - stream(stream::kind::eager).submit(net).wait(); - } - void AllocateOutputTensor( OpKernelContext* context, const lrn_forward::primitive_desc& lrn_fwd_prim_desc, const memory::dims output_dims_mkl_order, - const memory::format& output_tf_format, Tensor** output_tensor) { - CHECK_NOTNULL(output_tensor); - memory::primitive_desc dst_pd = lrn_fwd_prim_desc.dst_primitive_desc(); + const MKL_TENSOR_FORMAT& output_tf_format, Tensor** output_tensor) { + DCHECK(output_tensor != nullptr); + MEMORY_PRIMITIVE_DESC dst_pd = lrn_fwd_prim_desc.PRIMITIVE_DESC_DST; MklDnnShape output_mkl_shape; // We only handle the case when the inputs and output are in Mkl format @@ -231,8 +231,7 @@ class MklLRNOp : public OpKernel { auto in_shaped = input.shaped({nodes * batch, depth}); // Multiplying the input with the band matrix has the effect of reducing - // the - // correct patch along the depth. + // the correct patch along the depth. Eigen::Tensor multiplier(depth, depth); GetBandMatrix(depth, depth_radius_, &multiplier); @@ -242,7 +241,7 @@ class MklLRNOp : public OpKernel { mkl_output_mkl_shape.SetDimensions(4); AllocateOutputSetMklShape(context, kIdxOutput, &output_dnn_data, input.shape(), mkl_output_mkl_shape); - CHECK_NOTNULL(output_dnn_data); + DCHECK(output_dnn_data != nullptr); Tensor* workspace_tensor = nullptr; MklDnnShape workspace_mkl_shape; @@ -251,7 +250,7 @@ class MklLRNOp : public OpKernel { workspace_tf_shape.AddDim(0); AllocateOutputSetMklShape(context, kIdxWorkspace, &workspace_tensor, workspace_tf_shape, workspace_mkl_shape); - CHECK_NOTNULL(workspace_tensor); + DCHECK(workspace_tensor); auto out_shaped = output_dnn_data->shaped({nodes * batch, depth}); Eigen::array dims = {{DimPair(1, 0)}}; @@ -271,10 +270,10 @@ class MklLRNOp : public OpKernel { OpKernelContext* context, const lrn_forward::primitive_desc& lrn_fwd_prim_desc, MklDnnData* dnn_data_wksp) { - CHECK_NOTNULL(dnn_data_wksp); + DCHECK(dnn_data_wksp != nullptr); Tensor* workspace_tensor = nullptr; - memory::primitive_desc workspace_pd = - lrn_fwd_prim_desc.workspace_primitive_desc(); + MEMORY_PRIMITIVE_DESC workspace_pd = + lrn_fwd_prim_desc.PRIMITIVE_DESC_WORKSPACE; size_t workspace_bytes = workspace_pd.get_size(); MklDnnShape workspace_mkl_shape; // the workspace tensor is a uint8 tensor that has @@ -284,7 +283,7 @@ class MklLRNOp : public OpKernel { workspace_tf_shape.AddDim(workspace_bytes); AllocateOutputSetMklShape(context, kIdxWorkspace, &workspace_tensor, workspace_tf_shape, workspace_mkl_shape); - CHECK_NOTNULL(workspace_tensor); + DCHECK(workspace_tensor != nullptr); dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor); } @@ -295,16 +294,14 @@ class MklLRNOp : public OpKernel { if (src_dnn_shape.IsMklTensor()) { OP_REQUIRES(context, src_dnn_shape.GetDimension() == 4, errors::InvalidArgument("input must be 4-dimensional")); - OP_REQUIRES(context, - FastBoundsCheck(src_tensor.NumElements(), - std::numeric_limits::max()), + OP_REQUIRES(context, FastBoundsCheck(src_tensor.NumElements(), + std::numeric_limits::max()), errors::InvalidArgument("argument to LRN too large")); } else { OP_REQUIRES(context, src_tensor.dims() == 4, errors::InvalidArgument("input must be 4-dimensional")); - OP_REQUIRES(context, - FastBoundsCheck(src_tensor.NumElements(), - std::numeric_limits::max()), + OP_REQUIRES(context, FastBoundsCheck(src_tensor.NumElements(), + std::numeric_limits::max()), errors::InvalidArgument("argument to LRN too large")); } } @@ -316,19 +313,21 @@ class MklLRNOp : public OpKernel { float 
bias_; float alpha_; float beta_; + engine cpu_engine_; + std::shared_ptr fwd_stream_; }; template class MklLRNGradOp : public OpKernel { public: - explicit MklLRNGradOp(OpKernelConstruction* context) : OpKernel(context) { + explicit MklLRNGradOp(OpKernelConstruction* context) + : OpKernel(context), cpu_engine_(ENGINE_CPU, 0) { int64 depth_radius64; OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64)); - OP_REQUIRES( - context, - FastBoundsCheck(depth_radius64, std::numeric_limits::max()), - errors::InvalidArgument("depth_radius = ", depth_radius64, - " larger than int max")); + OP_REQUIRES(context, FastBoundsCheck(depth_radius64, + std::numeric_limits::max()), + errors::InvalidArgument("depth_radius = ", depth_radius64, + " larger than int max")); depth_radius_ = static_cast(depth_radius64); OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_)); OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_)); @@ -336,6 +335,7 @@ class MklLRNGradOp : public OpKernel { workspace_enabled_ = false; OP_REQUIRES_OK(context, context->GetAttr("workspace_enabled", &workspace_enabled_)); + bwd_stream_.reset(new CPU_STREAM(cpu_engine_)); } void Compute(OpKernelContext* context) override { @@ -343,11 +343,10 @@ class MklLRNGradOp : public OpKernel { SanityCheckInputs(context); if (!context->status().ok()) return; - auto cpu_engine = engine(engine::cpu, 0); - MklDnnData input_grad_dnn_data(&cpu_engine); - MklDnnData orig_input_dnn_data(&cpu_engine); - MklDnnData orig_output_dnn_data(&cpu_engine); - MklDnnData output_dnn_data(&cpu_engine); + MklDnnData input_grad_dnn_data(&cpu_engine_); + MklDnnData orig_input_dnn_data(&cpu_engine_); + MklDnnData orig_output_dnn_data(&cpu_engine_); + MklDnnData output_dnn_data(&cpu_engine_); MklDnnShape input_grad_dnn_shape, orig_input_dnn_shape, orig_output_dnn_shape; @@ -389,11 +388,11 @@ class MklLRNGradOp : public OpKernel { memory::dims orig_input_dims = orig_input_dnn_shape.GetSizesAsMklDnnDims(); orig_input_dnn_data.SetUsrMem(orig_input_md, &orig_input_tensor); - orig_input_dnn_data.SetOpMemDesc(orig_input_dims, memory::format::nhwc); + orig_input_dnn_data.SetOpMemDesc(orig_input_dims, MEMORY_FORMAT::nhwc); // output_dnn_data has the same shape as original input output_dnn_data.SetUsrMem(orig_input_md); - output_dnn_data.SetOpMemDesc(orig_input_dims, memory::format::nhwc); + output_dnn_data.SetOpMemDesc(orig_input_dims, MEMORY_FORMAT::nhwc); // MKL-DNN has a notion of kernel_size and not depth_radius. int kernel_size = 2 * depth_radius_ + 1; @@ -402,42 +401,61 @@ class MklLRNGradOp : public OpKernel { // Create LRN backward primitive descriptor. It requires LRN forward // primitive descriptor also. 
auto lrn_fwd_desc = lrn_forward::desc( - prop_kind::forward, lrn_across_channels, orig_input_md, kernel_size, - new_alpha, beta_, bias_); - auto lrn_fwd_prim_desc = - lrn_forward::primitive_desc(lrn_fwd_desc, cpu_engine); - auto lrn_bwd_desc = lrn_backward::desc( - lrn_across_channels, original_output_md, target_diff_dst_md, + prop_kind::forward, ALGORITHM::lrn_across_channels, orig_input_md, kernel_size, new_alpha, beta_, bias_); + auto lrn_fwd_prim_desc = + lrn_forward::primitive_desc(lrn_fwd_desc, cpu_engine_); + auto lrn_bwd_desc = lrn_backward::desc( + ALGORITHM::lrn_across_channels, original_output_md, + target_diff_dst_md, kernel_size, new_alpha, beta_, bias_); auto lrn_bwd_prim_desc = lrn_backward::primitive_desc( - lrn_bwd_desc, cpu_engine, lrn_fwd_prim_desc); + lrn_bwd_desc, cpu_engine_, lrn_fwd_prim_desc); Tensor* output_tensor = nullptr; - memory::format orig_input_format = orig_input_dnn_shape.GetTfDataFormat(); + auto orig_input_format = orig_input_dnn_shape.GetTfDataFormat(); AllocateOutputTensor(context, lrn_bwd_prim_desc, orig_input_dims, orig_input_format, &output_tensor); OP_REQUIRES_OK(context, context->status()); - CHECK_NOTNULL(output_tensor); + DCHECK(output_tensor != nullptr); output_dnn_data.SetUsrMemDataHandle(output_tensor); // Create LRN primitive and add it to the net // At this point, workspace is enabled, so we don't need // to check. Pass input workspace to LRN backward primitive. const Tensor& workspace_tensor = MklGetInput(context, kIdxWorkspace); - MklDnnData workspace_dnn_data(&cpu_engine); + MklDnnData workspace_dnn_data(&cpu_engine_); ConfigureWorkspace(workspace_tensor, - lrn_fwd_prim_desc.workspace_primitive_desc(), + lrn_fwd_prim_desc.PRIMITIVE_DESC_WORKSPACE, &workspace_dnn_data); - PrepareAndExecuteNet( - lrn_bwd_prim_desc, lrn_fwd_prim_desc, &orig_input_dnn_data, - &input_grad_dnn_data, &output_dnn_data, - memory::primitive_desc(target_diff_dst_md, cpu_engine), - &workspace_dnn_data); + // Check for input reordering on the diff dst input + input_grad_dnn_data.CheckReorderToOpMem(MEMORY_PD_WITHOUT_DATA( + lrn_bwd_prim_desc.PRIMITIVE_DESC_DIFF_DST, cpu_engine_)); + + // Check for input reordering on the original input + orig_input_dnn_data.CheckReorderToOpMem(MEMORY_PD_WITHOUT_DATA( + lrn_fwd_prim_desc.PRIMITIVE_DESC_SRC, cpu_engine_)); + + std::vector net; +#ifdef ENABLE_MKLDNN_V1 + std::vector> net_args; + net.push_back(lrn_backward(lrn_bwd_prim_desc)); + net_args.push_back({{MKLDNN_ARG_SRC, orig_input_dnn_data.GetOpMem()}, + {MKLDNN_ARG_DIFF_DST, input_grad_dnn_data.GetOpMem()}, + { MKLDNN_ARG_DST, + output_dnn_data.GetOpMem() }}); + net.push_back(lrn_backward(lrn_bwd_prim_desc)); + net.at(0).execute(*bwd_stream_, net_args.at(0)); +#else + net.push_back(lrn_backward( + lrn_bwd_prim_desc, orig_input_dnn_data.GetOpMem(), + input_grad_dnn_data.GetOpMem(), output_dnn_data.GetOpMem())); + bwd_stream_->submit(net).wait(); +#endif } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + ", message: " + + string(e.message) + ", in file " + string(__FILE__) + + ":" + std::to_string(__LINE__); OP_REQUIRES_OK( context, errors::Aborted("Operation received an exception:", error_msg)); @@ -448,10 +466,9 @@ class MklLRNGradOp : public OpKernel { OpKernelContext* context, const lrn_backward::primitive_desc& lrn_bkwd_prim_desc, const memory::dims 
output_dims_mkl_order, - const memory::format& output_tf_format, Tensor** output_tensor) { - CHECK_NOTNULL(output_tensor); - memory::primitive_desc dst_pd = - lrn_bkwd_prim_desc.diff_src_primitive_desc(); + const MKL_TENSOR_FORMAT& output_tf_format, Tensor** output_tensor) { + DCHECK(output_tensor != nullptr); + MEMORY_PRIMITIVE_DESC dst_pd = lrn_bkwd_prim_desc.PRIMITIVE_DESC_DIFF_SRC; MklDnnShape output_mkl_shape; // We assume that all outputs at this point are MKL Tensors @@ -472,56 +489,28 @@ class MklLRNGradOp : public OpKernel { memory::desc ConfigureInputGradient(const Tensor& input_grad_tensor, const MklDnnShape& input_grad_dnn_shape, MklDnnData* input_grad_dnn_data) { - CHECK_NOTNULL(input_grad_dnn_data); + DCHECK(input_grad_dnn_data != nullptr); // This shouldn't be necessary at this point, but just in case - CHECK_EQ(input_grad_dnn_shape.IsMklTensor(), true); + DCHECK(input_grad_dnn_shape.IsMklTensor() == true); memory::desc input_grad_md = input_grad_dnn_shape.GetCurLayout(); memory::dims orig_input_dims = input_grad_dnn_shape.GetSizesAsMklDnnDims(); input_grad_dnn_data->SetUsrMem(input_grad_md, &input_grad_tensor); - input_grad_dnn_data->SetOpMemDesc(orig_input_dims, memory::format::nhwc); + input_grad_dnn_data->SetOpMemDesc(orig_input_dims, MEMORY_FORMAT::nhwc); return input_grad_md; } - void PrepareAndExecuteNet( - const lrn_backward::primitive_desc& lrn_bkwd_desc, - const lrn_forward::primitive_desc& lrn_fwd_desc, - MklDnnData* src_dnn_data, MklDnnData* input_gradient_diff_dst, - MklDnnData* output_diff_src, - const memory::primitive_desc& target_diff_dst_pd, - const MklDnnData* workspace_dnn_data = nullptr) { - // Check for input reordering on the diff dst input - input_gradient_diff_dst->CheckReorderToOpMem( - lrn_bkwd_desc.diff_dst_primitive_desc()); - - // Check for input reordering on the original input - src_dnn_data->CheckReorderToOpMem(lrn_fwd_desc.src_primitive_desc()); - // Create pooling primitive and add it to net - std::vector net; - if (nullptr == workspace_dnn_data) { - net.push_back(lrn_backward(lrn_bkwd_desc, src_dnn_data->GetOpMem(), - input_gradient_diff_dst->GetOpMem(), - output_diff_src->GetOpMem())); - } else { - net.push_back(lrn_backward(lrn_bkwd_desc, src_dnn_data->GetOpMem(), - input_gradient_diff_dst->GetOpMem(), - workspace_dnn_data->GetOpMem(), - output_diff_src->GetOpMem())); - } - stream(stream::kind::eager).submit(net).wait(); - } - void ConfigureWorkspace(const Tensor& workspace_tensor, - memory::primitive_desc workspace_pd, + MEMORY_PRIMITIVE_DESC workspace_pd, MklDnnData* workspace_dnn_data) { - CHECK_NOTNULL(workspace_dnn_data); + DCHECK(workspace_dnn_data); workspace_dnn_data->SetUsrMem(workspace_pd, &workspace_tensor); } // Fallback implementation - Taken from lrn_op.cc - // TODO(intelft) Check if we can use EigenLRNOp directly instead of making a - // copy. + // TODO(intel-tf) Check if we can use EigenLRNOp directly + // instead of making a copy. void MklDefaultToEigen(OpKernelContext* context) { Tensor input_gradient_tensor; Tensor orig_input_tensor; @@ -676,6 +665,8 @@ class MklLRNGradOp : public OpKernel { float bias_; float alpha_; float beta_; + engine cpu_engine_; + std::shared_ptr bwd_stream_; }; #define REGISTER_MKL_LRN_CPU(T) \ diff --git a/tensorflow/core/kernels/mkl_softmax_op.cc b/tensorflow/core/kernels/mkl_softmax_op.cc index d3645b948dc..b9f8e590d0e 100644 --- a/tensorflow/core/kernels/mkl_softmax_op.cc +++ b/tensorflow/core/kernels/mkl_softmax_op.cc @@ -14,17 +14,19 @@ limitations under the License. 
==============================================================================*/ // See docs in ../ops/nn_ops.cc. + #ifdef INTEL_MKL #include "mkldnn.hpp" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/util/mkl_types.h" #include "tensorflow/core/util/mkl_util.h" #include "tensorflow/core/util/tensor_format.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" using mkldnn::prop_kind; using mkldnn::softmax_forward; @@ -35,10 +37,10 @@ namespace tensorflow { class MklSoftmaxParams { public: memory::dims src_dims; - memory::format src_fmt; + MKL_TENSOR_FORMAT src_fmt; int axis; - MklSoftmaxParams(memory::dims src_dims, memory::format src_fmt, int axis) + MklSoftmaxParams(memory::dims src_dims, MKL_TENSOR_FORMAT src_fmt, int axis) : src_dims(src_dims), src_fmt(src_fmt), axis(axis) {} }; @@ -46,8 +48,8 @@ template class MklSoftmaxPrimitive : public MklPrimitive { public: explicit MklSoftmaxPrimitive(const MklSoftmaxParams& fwdParams) - : cpu_engine_(engine::cpu, 0) { - context_.fwd_stream.reset(new stream(stream::kind::eager)); + : cpu_engine_(ENGINE_CPU, 0) { + context_.fwd_stream.reset(new CPU_STREAM(cpu_engine_)); Setup(fwdParams); } @@ -61,9 +63,18 @@ class MklSoftmaxPrimitive : public MklPrimitive { static_cast(const_cast(src_data))); context_.dst_mem->set_data_handle(static_cast(dst_data)); +#ifdef ENABLE_MKLDNN_V1 + DCHECK_EQ(context_.fwd_primitives.size(), + context_.fwd_net_args.size()); + for (size_t i = 0; i < context_.fwd_primitives.size(); ++i) { + context_.fwd_primitives.at(i).execute(*context_.fwd_stream, + context_.fwd_net_args.at(i)); + } +#else context_.fwd_stream->submit(context_.fwd_primitives); +#endif - // After execution, set data handle back + // After execution, set data handle back. context_.src_mem->set_data_handle(DummyData); context_.dst_mem->set_data_handle(DummyData); } @@ -74,22 +85,23 @@ class MklSoftmaxPrimitive : public MklPrimitive { private: struct SoftmaxFwdContext { - // MKL-DNN memory + // MKL-DNN memory. std::shared_ptr src_mem; std::shared_ptr dst_mem; - // Primitive desc + // Primitive descriptor. std::shared_ptr fwd_desc; - // Memory desc + // Memory descriptor. std::shared_ptr src_md; - // Softmax primitive + // Softmax primitive. std::shared_ptr fwd_pd; std::shared_ptr softmax_fwd; std::shared_ptr fwd_stream; std::vector fwd_primitives; + std::vector fwd_net_args; SoftmaxFwdContext() : src_mem(nullptr), @@ -103,25 +115,33 @@ class MklSoftmaxPrimitive : public MklPrimitive { // Softmax forward primitive setup void Setup(const MklSoftmaxParams& fwdParams) { - // Create memory descriptors for softmax data with specified format - context_.src_md.reset(new memory::desc({fwdParams.src_dims}, - MklDnnType(), fwdParams.src_fmt)); + // Create memory descriptors for softmax data with specified format. + auto src_format = GET_TENSOR_FORMAT(fwdParams.src_fmt); + context_.src_md.reset( + new memory::desc({fwdParams.src_dims}, MklDnnType(), src_format)); - // Create a softmax + // Create softmax decriptor and primitive descriptor. 
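For orientation, a standalone sketch of the MKL-DNN v1.x softmax sequence that the wrapper classes here drive, assuming the v1.x mkldnn.hpp API (engine kind, format_tag, and execute with an argument map); the buffer contents and sizes are placeholders.

```cpp
#include <algorithm>
#include <vector>

#include "mkldnn.hpp"

int main() {
  using namespace mkldnn;
  engine eng(engine::kind::cpu, 0);
  stream s(eng);

  // A 2x3 f32 tensor in "nc" layout; softmax over axis 1 (the last dim).
  memory::desc md({2, 3}, memory::data_type::f32, memory::format_tag::nc);
  memory src_mem(md, eng);
  memory dst_mem(md, eng);

  // Fill the source buffer through the handle the library allocated.
  std::vector<float> host = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
  std::copy(host.begin(), host.end(),
            static_cast<float*>(src_mem.get_data_handle()));

  // v1.x style: descriptor -> primitive_desc -> primitive, executed with an
  // argument map rather than by submitting a net to the stream.
  auto desc = softmax_forward::desc(prop_kind::forward_inference, md, /*axis=*/1);
  auto pd = softmax_forward::primitive_desc(desc, eng);
  auto softmax = softmax_forward(pd);
  softmax.execute(s, {{MKLDNN_ARG_SRC, src_mem}, {MKLDNN_ARG_DST, dst_mem}});
  s.wait();
  return 0;
}
```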
context_.fwd_desc.reset(new mkldnn::softmax_forward::desc( prop_kind::forward_scoring, *context_.src_md, fwdParams.axis)); context_.fwd_pd.reset(new mkldnn::softmax_forward::primitive_desc( *context_.fwd_desc, cpu_engine_)); - // Create memory primitive based on dummy data - context_.src_mem.reset( - new memory({*context_.src_md, cpu_engine_}, DummyData)); - context_.dst_mem.reset( - new memory(context_.fwd_pd.get()->dst_primitive_desc(), DummyData)); + // Create memory primitive based on dummy data. + context_.src_mem.reset(new MEMORY_CONSTRUCTOR_USING_MD( + *context_.src_md, cpu_engine_, DummyData)); + context_.dst_mem.reset(new MEMORY_CONSTRUCTOR_PD( + context_.fwd_pd.get()->PRIMITIVE_DESC_DST, cpu_engine_, DummyData)); +#ifdef ENABLE_MKLDNN_V1 // Create softmax primitive and add it to net + context_.softmax_fwd.reset(new mkldnn::softmax_forward(*context_.fwd_pd)); + context_.fwd_net_args.push_back({{MKLDNN_ARG_SRC, *context_.src_mem}, + { MKLDNN_ARG_DST, + *context_.dst_mem }}); +#else context_.softmax_fwd.reset(new mkldnn::softmax_forward( *context_.fwd_pd, *context_.src_mem, *context_.dst_mem)); +#endif // ENABLE_MKLDNN_V1 context_.fwd_primitives.push_back(*context_.softmax_fwd); } @@ -134,7 +154,7 @@ template class MklSoftmaxPrimitiveFactory : public MklPrimitiveFactory { public: static MklSoftmaxPrimitive* Get(const MklSoftmaxParams& fwdParams) { - // Get a softmax fwd primitive from the cached pool + // Get a softmax fwd primitive from the cached pool. MklSoftmaxPrimitive* softmax_forward = static_cast*>( MklSoftmaxPrimitiveFactory::GetInstance().GetSoftmaxFwd( @@ -189,15 +209,15 @@ class MklSoftmaxOp : public OpKernel { void Compute(OpKernelContext* context) override { try { - // src_tensor now points to the 0-th input of global data struct "context" + auto cpu_engine = engine(ENGINE_CPU, 0); + // src_tensor points to the 0-th input of global data struct "context". size_t src_idx = 0; const Tensor& src_tensor = MklGetInput(context, src_idx); - // Add: get MklShape MklDnnShape src_mkl_shape; GetMklShape(context, src_idx, &src_mkl_shape); - // src_dims is the dimension of src_tensor - // dim of the dst will also be same as src_dims + // src_dims is the dimension of src_tensor. + // Dim of the dst will also be same as src_dims. auto src_tf_shape = src_mkl_shape.IsMklTensor() ? src_mkl_shape.GetTfShape() : src_tensor.shape(); @@ -211,7 +231,7 @@ class MklSoftmaxOp : public OpKernel { src_dims = TFShapeToMklDnnDims(src_tf_shape); axis = input_dims - 1; } - memory::format layout_type; + MKL_TENSOR_FORMAT layout_type; // In MKL, data format passed to mkl softmax op depends on dimension of // the input tensor. Here "x" data format in MKL is used for 1 dim tensor, // "nc" for 2 dim tensor, "tnc" for 3 dim tensor, "nchw" for 4 dim tensor, @@ -223,26 +243,26 @@ class MklSoftmaxOp : public OpKernel { // dimension to do softmax. 
switch (input_dims) { case 1: - layout_type = memory::format::x; + layout_type = MKL_TENSOR_FORMAT_X; break; case 2: - layout_type = memory::format::nc; + layout_type = MKL_TENSOR_FORMAT_NC; break; case 3: - layout_type = memory::format::tnc; + layout_type = MKL_TENSOR_FORMAT_TNC; break; case 4: if (src_mkl_shape.IsMklTensor()) { - layout_type = memory::format::nhwc; + layout_type = MKL_TENSOR_FORMAT_NHWC; } else { - layout_type = memory::format::nchw; + layout_type = MKL_TENSOR_FORMAT_NCHW; } break; case 5: if (src_mkl_shape.IsMklTensor()) { - layout_type = memory::format::ndhwc; + layout_type = MKL_TENSOR_FORMAT_NDHWC; } else { - layout_type = memory::format::ncdhw; + layout_type = MKL_TENSOR_FORMAT_NCDHW; } break; default: @@ -254,21 +274,20 @@ class MklSoftmaxOp : public OpKernel { // If input is in MKL layout, then simply get the format from input; // otherwise, use TF layout defined before. auto src_fmt = src_mkl_shape.IsMklTensor() - ? static_cast( - src_mkl_shape.GetMklLayout().data.format) + ? GET_FORMAT_FROM_SHAPE(src_mkl_shape) : layout_type; - // Get a softmax fwd from primitive pool + // Get a softmax fwd primitive from primitive pool. MklSoftmaxParams fwdParams(src_dims, src_fmt, axis); MklSoftmaxPrimitive* softmax_fwd = MklSoftmaxPrimitiveFactory::Get(fwdParams); - // Add output + // Prepare for creating output tensor. Tensor* output_tensor = nullptr; MklDnnShape output_mkl_shape; TensorShape output_tf_shape; // shape of output TF tensor. - auto dst_pd = softmax_fwd->GetSoftmaxFwdPd()->dst_primitive_desc(); + auto dst_pd = softmax_fwd->GetSoftmaxFwdPd()->PRIMITIVE_DESC_DST; // If input is MKL shape, output is also MKL shape. // If input is TF shape, output is also TF shape. @@ -278,23 +297,23 @@ class MklSoftmaxOp : public OpKernel { output_mkl_shape.SetElemType(MklDnnType()); output_mkl_shape.SetTfLayout(src_dims.size(), src_dims, layout_type); output_tf_shape.AddDim((dst_pd.get_size() / sizeof(T))); - } else { // then output is also TF shape + } else { output_mkl_shape.SetMklTensor(false); output_tf_shape = MklDnnDimsToTFShape(src_dims); } - // Allocate output shape (MKL or TF based on the above) + // Allocate output tensor. AllocateOutputSetMklShape(context, 0, &output_tensor, output_tf_shape, output_mkl_shape); const T* src_data = src_tensor.flat().data(); T* dst_data = reinterpret_cast(output_tensor->flat().data()); - // Execute softmax + // Execute softmax primitive. 
softmax_fwd->Execute(src_data, dst_data); } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + ", message: " + + string(e.message) + ", in file " + string(__FILE__) + + ":" + std::to_string(__LINE__); OP_REQUIRES_OK( context, errors::Aborted("Operation received an exception:", error_msg)); @@ -311,6 +330,7 @@ class MklSoftmaxOp : public OpKernel { .TypeConstraint("T") \ .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ MklSoftmaxOp); + TF_CALL_float(REGISTER_SOFTMAX_MKL_SUPPORTED_KERNELS_TYPES); TF_CALL_bfloat16(REGISTER_SOFTMAX_MKL_SUPPORTED_KERNELS_TYPES); From fd0d5adaae9ab198623ed1c334cd8b3a09a934cf Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Sun, 2 Feb 2020 10:32:50 +0100 Subject: [PATCH 064/442] Introduce TrtShapeOptimizationProfile class and use it in TRTEngineOp --- tensorflow/compiler/tf2tensorrt/BUILD | 17 ++ .../tf2tensorrt/convert/convert_graph.cc | 3 +- .../tf2tensorrt/convert/convert_nodes.cc | 19 +- .../tf2tensorrt/convert/convert_nodes.h | 7 +- .../tf2tensorrt/convert/convert_nodes_test.cc | 5 +- .../tf2tensorrt/kernels/trt_engine_op.cc | 57 +++-- .../tf2tensorrt/utils/trt_lru_cache.h | 6 + .../utils/trt_shape_optimization_profiles.cc | 177 +++++++++++++++ .../utils/trt_shape_optimization_profiles.h | 179 +++++++++++++++ .../trt_shape_optimization_profiles_test.cc | 214 ++++++++++++++++++ 10 files changed, 656 insertions(+), 28 deletions(-) create mode 100644 tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc create mode 100644 tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h create mode 100644 tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD index 65679bd021a..8427c288225 100644 --- a/tensorflow/compiler/tf2tensorrt/BUILD +++ b/tensorflow/compiler/tf2tensorrt/BUILD @@ -242,10 +242,12 @@ tf_cuda_library( srcs = [ "utils/trt_int8_calibrator.cc", "utils/trt_lru_cache.cc", + "utils/trt_shape_optimization_profiles.cc", ], hdrs = [ "utils/trt_int8_calibrator.h", "utils/trt_lru_cache.h", + "utils/trt_shape_optimization_profiles.h", ], deps = [ ":trt_allocator", @@ -301,6 +303,21 @@ tf_cc_test( ], ) +tf_cuda_cc_test( + name = "trt_shape_optimization_profiles_test", + size = "small", + srcs = ["utils/trt_shape_optimization_profiles_test.cc"], + tags = [ + "no_windows", + "nomac", + ], + deps = [ + ":trt_resources", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + tf_cuda_library( name = "logger_registry", srcs = ["convert/logger_registry.cc"], diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index 1bcc2c044f0..b27ba068de2 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -431,7 +431,8 @@ Status CreateTRTNode(const ConversionParams& params, calibrate_int8 ? 
TrtPrecisionMode::FP32 : info.precision_mode, max_batch_size, info.max_workspace_size_bytes, input_shapes, trt_logger, alloc, /*calibrator=*/nullptr, &engine, info.use_calibration, - params.use_implicit_batch, /*convert_successfully=*/nullptr)); + params.use_implicit_batch, /*convert_successfully=*/nullptr, + /*profile=*/nullptr)); TrtUniquePtrType engine_data(engine->serialize()); segment_string = string(static_cast(engine_data->data()), engine_data->size()); diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index 4fe040019ea..10805da2f06 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -31,6 +31,7 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" #include "tensorflow/core/framework/node_def.pb.h" // NOLINT #include "tensorflow/core/framework/node_def_builder.h" @@ -1334,9 +1335,10 @@ Status Converter::RenameAndMarkOutputTensors( } Status Converter::BuildCudaEngine( - TrtUniquePtrType* engine, int max_batch_size, - size_t max_workspace_size_bytes, nvinfer1::IGpuAllocator* allocator, - TRTInt8Calibrator* calibrator) { + TrtUniquePtrType* engine, + int max_batch_size, size_t max_workspace_size_bytes, + nvinfer1::IGpuAllocator* allocator, TRTInt8Calibrator* calibrator, + TrtShapeOptimizationProfile* profiles) { VLOG(1) << "Configuring TensorRT builder"; trt_builder_->setMaxBatchSize(max_batch_size); trt_builder_->setGpuAllocator(allocator); @@ -1356,7 +1358,10 @@ Status Converter::BuildCudaEngine( builder_config->setInt8Calibrator(nullptr); } } - + if (!use_implicit_batch_ && profiles) { + profiles->ConfigureBuilder(trt_builder_.get(), builder_config.get(), + network()); + } VLOG(1) << "Building TensorRT engine"; engine->reset( trt_builder_->buildEngineWithConfig(*network(), *builder_config)); @@ -5734,7 +5739,8 @@ Status ConvertGraphDefToEngine( nvinfer1::ILogger* trt_logger, nvinfer1::IGpuAllocator* allocator, TRTInt8Calibrator* calibrator, TrtUniquePtrType* engine, bool use_calibration, - const bool use_implicit_batch, bool* convert_successfully) { + const bool use_implicit_batch, bool* convert_successfully, + TrtShapeOptimizationProfile* profiles) { engine->reset(); if (convert_successfully) *convert_successfully = false; @@ -5833,7 +5839,8 @@ Status ConvertGraphDefToEngine( // Build the engine. TF_RETURN_IF_ERROR(converter->BuildCudaEngine( - engine, max_batch_size, max_workspace_size_bytes, allocator, calibrator)); + engine, max_batch_size, max_workspace_size_bytes, allocator, calibrator, + profiles)); VLOG(1) << "Finished conversion"; return Status::OK(); diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h index d295f074a98..3f65b1a9818 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h @@ -26,6 +26,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/grappler/costs/graph_properties.h" @@ -145,7 +146,8 @@ Status ConvertGraphDefToEngine( nvinfer1::ILogger* logger, nvinfer1::IGpuAllocator* allocator, TRTInt8Calibrator* calibrator, TrtUniquePtrType* engine, bool use_calibration, - const bool use_implicit_batch, bool* convert_successfully); + const bool use_implicit_batch, bool* convert_successfully, + TrtShapeOptimizationProfile* profiles); // Helper class for the segmenter to determine whether an output edge from the // TRT segment is valid. @@ -465,7 +467,8 @@ class Converter { Status BuildCudaEngine(TrtUniquePtrType* engine, int max_batch_size, size_t max_workspace_size_bytes, nvinfer1::IGpuAllocator* allocator, - TRTInt8Calibrator* calibrator); + TRTInt8Calibrator* calibrator, + TrtShapeOptimizationProfile* profiles); ////////////////////////////////////////////////////////////////////////////// // Methods used by op converters to convert individual TF node and add layers diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index 98aaa18e9fc..400c53614f9 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -1187,7 +1187,7 @@ class ConvertGraphDefToEngineTest : public ::testing::Test { /*max_workspace_size_bytes=*/64 << 20, input_shapes, &logger_, /*allocator=*/nullptr, /*calibrator=*/nullptr, &engine_, /*use_calibration=*/false, /*use_implicit_batch=*/true, - /*convert_successfully=*/nullptr); + /*convert_successfully=*/nullptr, /*profiles=*/nullptr); } protected: @@ -1302,7 +1302,8 @@ class OpConverterTest : public ::testing::Test { /*max_batch_size=*/batch_size, /*max_workspace_size_bytes=*/1 << 26, /*allocator=*/nullptr, - /*calibrator=*/nullptr)); + /*calibrator=*/nullptr, + /*profiles=*/nullptr)); CHECK_NOTNULL(engine_.get()); CheckDataTypeMatches(input_data); CheckDataTypeMatches(*output_data); diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index 70ec4fc0665..e39176bdf85 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/graph_optimizer.h" #include "tensorflow/core/framework/function.h" @@ -92,7 +93,7 @@ class TRTEngineOp : public AsyncOpKernel { LRUCache, std::unique_ptr, VectorTensorShapeHasher>; - // Execute calibration + // Execute calibration. void ExecuteCalibration(OpKernelContext* ctx, TRTEngineCacheResource* cache_res, AsyncHelper* helper); @@ -108,9 +109,10 @@ class TRTEngineOp : public AsyncOpKernel { // Execute the tensorrt engine. 
Returns whether we need to retry by running // the native segment. - bool ExecuteTrtEngine(OpKernelContext* ctx, EngineContext* engine_context); + bool ExecuteTrtEngine(OpKernelContext* ctx, EngineContext* engine_context, + int trt_context_idx); - // Allocate necessary resources for calibration + // Allocate necessary resources for calibration. Status AllocateCalibrationResources(OpKernelContext* ctx, TRTEngineCacheResource* cache_res); @@ -594,11 +596,24 @@ void TRTEngineOp::ComputeAsync(OpKernelContext* ctx, OP_REQUIRES_OK_ASYNC(ctx, VerifyInputShapes(input_concrete_shapes), *helper); + if (!use_implicit_batch_) { + if (cache_res->profiles_.GetNumProfiles() == 0) { + // Create a single profile from the current input shape. + // In the future we will collect a set of input shapes during build mode + // and create profiles for each of them. + cache_res->profiles_.AddShape(input_concrete_shapes); + cache_res->profiles_.InitProfiles(); + } + } StatusOr status = GetEngine(input_concrete_shapes, ctx, cache_res); OP_REQUIRES_OK_ASYNC(ctx, status.status(), *helper); EngineContext* engine_context = status.ValueOrDie(); + // Context idx equals with the profile idx because for each profile we create + // one context. Currently we do not have profile_generation mode, therefore we + // have just a single profile. + int trt_context_idx = 0; if (!engine_context->cuda_engine) { VLOG(1) << "Engine retrieval for input shapes: " << TensorShapeUtils::ShapeListString(input_concrete_shapes) @@ -606,7 +621,8 @@ void TRTEngineOp::ComputeAsync(OpKernelContext* ctx, ExecuteNativeSegment(ctx, helper); return; } - const bool retry = ExecuteTrtEngine(ctx, engine_context); + + const bool retry = ExecuteTrtEngine(ctx, engine_context, trt_context_idx); if (retry) { LOG(WARNING) << "Failed to execute engine, " << "retrying with native segment for " << name(); @@ -654,7 +670,8 @@ Status GetTrtBindingIndex(const char* tensor_name, int profile_index, } bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx, - EngineContext* engine_context) { + EngineContext* engine_context, + int trt_context_idx) { VLOG(1) << "Executing TRT engine: " << name(); auto& cuda_engine = engine_context->cuda_engine; @@ -677,6 +694,11 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx, } const bool kRetry = true; + if (trt_context_idx >= 1) { + LOG(ERROR) << "Requested engine context with index " << trt_context_idx + << ", but only 1 context is present."; + return kRetry; + } auto& execution_context = engine_context->execution_context; const int num_binding = cuda_engine->getNbBindings(); std::vector buffers(num_binding); @@ -685,8 +707,8 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx, for (int i = 0; i < ctx->num_inputs(); i++) { const string input_name = StrCat(IONamePrefixes::kInputPHName, i); int binding_index; - auto status = GetTrtBindingIndex(input_name.c_str(), 0, cuda_engine.get(), - &binding_index); + auto status = GetTrtBindingIndex(input_name.c_str(), trt_context_idx, + cuda_engine.get(), &binding_index); if (!status.ok()) { ctx->SetStatus(status); return !kRetry; @@ -757,8 +779,8 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx, for (int i = 0; i < ctx->num_outputs(); i++) { const string output_name = StrCat(IONamePrefixes::kOutputPHName, i); int binding_index; - auto status = GetTrtBindingIndex(output_name.c_str(), 0, cuda_engine.get(), - &binding_index); + auto status = GetTrtBindingIndex(output_name.c_str(), trt_context_idx, + cuda_engine.get(), &binding_index); if (!status.ok()) { 
ctx->SetStatus(status); return !kRetry; @@ -788,7 +810,7 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx, trt_shape.push_back(dims.d[j]); } } - // Allocate output tensor of TRTEngineOp + // Allocate output tensor of TRTEngineOp. Tensor* output_tensor = nullptr; TensorShape output_shape; status = TensorShapeUtils::MakeShape(trt_shape.data(), trt_shape.size(), @@ -975,7 +997,8 @@ StatusOr TRTEngineOp::GetEngine( auto status = convert::ConvertGraphDefToEngine( segment_graph_def_, precision_mode_, batch_size, workspace_size_, conversion_input_shapes, &logger, allocator, calibrator_.get(), &engine, - use_calibration_, use_implicit_batch_, &convert_successfully); + use_calibration_, use_implicit_batch_, &convert_successfully, + &cache_res->profiles_); if (!status.ok()) { LOG(WARNING) << "Engine creation for " << name() << " failed. " << "The native segment will be used instead. " @@ -985,11 +1008,11 @@ StatusOr TRTEngineOp::GetEngine( cache.emplace(input_concrete_shapes, absl::make_unique()); return &empty_context; } - TrtUniquePtrType exec_context( - engine->createExecutionContext()); + std::vector> exec_context; + cache_res->profiles_.CreateExecutionContexts(engine.get(), exec_context); cache.emplace(input_concrete_shapes, absl::make_unique(std::move(engine), - std::move(exec_context))); + std::move(exec_context[0]))); VLOG(1) << "Added new engine to cache of " << name() << ". Cache size: " << cache.size(); } @@ -1063,9 +1086,9 @@ Status TRTEngineOp::AllocateCalibrationResources( this->segment_graph_def_, TrtPrecisionMode::INT8, cres->calibrator_->getBatchSize(), this->workspace_size_, partial_shapes, &cache_res->GetLogger(), cache_res->allocator_.get(), - cres->calibrator_.get(), &cres->engine_, - /*use_calibration=*/true, this->use_implicit_batch_, - /*convert_successfully=*/nullptr); + cres->calibrator_.get(), &cres->engine_, /*use_calibration=*/true, + this->use_implicit_batch_, /*convert_successfully=*/nullptr, + /*profiles=*/nullptr); if (!s.ok()) { LOG(ERROR) << "Calibration failed: " << s; cres->calibrator_->setDone(); // Ignore further pushes diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h index 808b689127e..c652d364485 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h @@ -21,6 +21,7 @@ limitations under the License. #include #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" @@ -182,6 +183,11 @@ class TRTEngineCacheResource : public ResourceBase { // TODO(hinsu): Use different calibration context for the available shapes and // attach it to each item of the cache. std::unique_ptr calib_ctx_; + + // This object maintains all the optimization profiles during profile generation + // and engine build. We currently don't use this object during runtime, instead + // we deserialize the profiles out of the cached engines. 
+ TrtShapeOptimizationProfile profiles_; }; #endif // GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc new file mode 100644 index 00000000000..6d159b86d08 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc @@ -0,0 +1,177 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h" +#include +#include +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" + +namespace tensorflow { +namespace tensorrt { + +// Create optimization profiles for a list of input shapes. The list of input +// shapes is stored in input_shapes_. +void TrtShapeOptimizationProfile::InitProfiles() { + if (input_shapes_.size() == 0) { + VLOG(1) << "Not creating profiles without input_shapes. " + "You have to enable profile generation mode first (build)."; + } else { + VLOG(1) << "Creating profiles with strategy of one profile " + << "for each input (min=opt=max)."; + } + for (auto& shape_vec : input_shapes_) { + std::vector dimvec; + for (auto& shape : shape_vec) { + dimvec.push_back(TensorShapeToTrtDims(shape, false)); + } + // We set min=opt=max. + OptimizationProfileConfig profConfig{dimvec, dimvec, dimvec}; + profiles_.push_back(std::move(profConfig)); + VLOG(1) << "Created profile " << profiles_.back().DebugString(); + } +} + +#if IS_TRT_VERSION_GE(6, 0, 0, 0) +Status TrtShapeOptimizationProfile::AddProfiles( + nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, + const nvinfer1::INetworkDefinition* network) { + // Create a vector of optimization profiles. + for (int i = 0; i < profiles_.size(); i++) { + auto* optProfile = builder->createOptimizationProfile(); + Status status = profiles_[i].SetDimensions(network, optProfile); + if (!status.ok()) { + return status; + } + int idx = -1; + if (optProfile->isValid()) { + idx = config->addOptimizationProfile(optProfile); + } + if (idx >= 0) { + if (i != idx) { + return errors::Internal( + "Profile index of engine config is different from resource profile " + "index: ", + i, " != ", idx); + } + VLOG(1) << "Added optimization profile " << profiles_[i].DebugString() + << " to builder config."; + } else { + LOG(ERROR) << "Failed to add optimization profile " + << profiles_[i].DebugString() + << ". 
This usually happens when profile is invalid."; + } + } + if (config->getNbOptimizationProfiles() == 0) { + return errors::Internal("Failure in adding an optimization profile."); + } + // if TRT_VERSION < 6, then we do not need to add + return Status::OK(); +} +#endif + +#if IS_TRT_VERSION_GE(6, 0, 0, 0) +Status TrtShapeOptimizationProfile::ConfigureBuilder( + nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, + const nvinfer1::INetworkDefinition* network) { + AddProfiles(builder, config, network); + return Status::OK(); +} +#endif + +int TrtShapeOptimizationProfile::GetProfileNumber( + std::vector shapes) { + for (int i = 0; i < profiles_.size(); i++) { + if (profiles_[i].IncludesShapes(shapes)) { + return i; + } + } + VLOG(1) << "Profile not found for input shapes " << DebugString(shapes) + << "."; + return -1; +} + +Status TrtShapeOptimizationProfile::CreateExecutionContexts( + nvinfer1::ICudaEngine* engine, + std::vector>& exec_context) { + int i = 0; + // The following loops runs once if we have static shapes, to create a single + // execution context without profiles. + // In dynamic mode we create one context for each profile and set the + // corresponding optimization profile. + do { + VLOG(1) << "Creating execution context " << i; + nvinfer1::IExecutionContext* ctx = engine->createExecutionContext(); + if (ctx == nullptr) { + return errors::Internal("Failed to create execution context"); + } + if (i > 0) { + // This condition is needed for two reasons: + // - using static shapes we do not have any profiles so we cannot call + // set optimizationprofiles. + // - The 0th profile is set implicitly for the first execution context + // therefore we do not need to set. +#if IS_TRT_VERSION_GE(6, 0, 0, 0) + bool stat = ctx->setOptimizationProfile(i); + if (!stat) { + ctx->destroy(); + return errors::Internal("Could not set TRT optimization profile."); + } +#endif + } + exec_context.push_back( + std::move(TrtUniquePtrType(ctx))); + i++; + } while (i < profiles_.size()); + + return Status::OK(); +} + +Status TrtShapeOptimizationProfile::RestoreProfiles( + const nvinfer1::ICudaEngine* engine) { +#if IS_TRT_VERSION_GE(6, 0, 0, 0) + if (!engine || engine->hasImplicitBatchDimension()) { + // Nothing to do, we cannot have profiles in implicit batch mode + return Status::OK(); + } + int n_profiles = engine->getNbOptimizationProfiles(); + int n_inputs = GetNumberOfEngineInputs(engine); + VLOG(2) << "Attempting to restore " << n_profiles << " profiles, each with " + << n_inputs << " inputs"; + for (int prof_idx = 0; prof_idx < n_profiles; prof_idx++) { + OptimizationProfileConfig cfg; + for (int j = 0; j < n_inputs; j++) { + nvinfer1::Dims min = engine->getProfileDimensions( + j, prof_idx, nvinfer1::OptProfileSelector::kMIN); + nvinfer1::Dims max = engine->getProfileDimensions( + j, prof_idx, nvinfer1::OptProfileSelector::kMAX); + nvinfer1::Dims opt = engine->getProfileDimensions( + j, prof_idx, nvinfer1::OptProfileSelector::kOPT); + cfg.min.push_back(min); + cfg.max.push_back(max); + cfg.opt.push_back(opt); + } + VLOG(2) << "Restored profile " << cfg.DebugString(); + profiles_.push_back(std::move(cfg)); + } +#endif + return Status::OK(); +} + +int TrtShapeOptimizationProfile::GetNumProfiles() const { + return profiles_.size(); +} + +} // namespace tensorrt +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h new file mode 100644 index 
00000000000..a4b98570db8 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h @@ -0,0 +1,179 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_SHAPE_OPTIMIZATION_PROFILES_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_SHAPE_OPTIMIZATION_PROFILES_H_ + +#include +#include +#include +#include + +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" + +#if GOOGLE_CUDA +#if GOOGLE_TENSORRT + +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { + +// Stores optimization profile parameters (min/opt/max of each input shape) +// +// A TensorRT optimization profile describes the possible min/max values of +// each dynamic input shape along with an optimum value. These values are used +// by the TensorRT builder to select the best kernel for the optimum value among +// those kernels that are valid for all input tensors in the [min, max] range. +struct OptimizationProfileConfig { + // Length of vector == num_inputs to engine + std::vector min; + std::vector opt; + std::vector max; + + string DebugString() const { + using absl::StrCat; + return StrCat("[min: ", tensorflow::tensorrt::DebugString(min), + ", opt: : ", tensorflow::tensorrt::DebugString(opt), + ", max: ", tensorflow::tensorrt::DebugString(max), "]"); + } + +#if IS_TRT_VERSION_GE(6, 0, 0, 0) + // Set the stored min/opt/max dimensions for profile. + // + // Parameters: + // network - TensorRT network, used to enumerate all the input tensors + // profile - on exit the profile information will be set for each input tensor + Status SetDimensions(const nvinfer1::INetworkDefinition* network, + nvinfer1::IOptimizationProfile* profile) const { + int n_inputs = network->getNbInputs(); + if (min.size() != n_inputs || opt.size() != n_inputs || + max.size() != n_inputs) { + return errors::Internal("Incorrect number of profile config parameters"); + } + for (int i = 0; i < n_inputs; i++) { + const char* name = network->getInput(i)->getName(); + profile->setDimensions(name, nvinfer1::OptProfileSelector::kMIN, min[i]); + profile->setDimensions(name, nvinfer1::OptProfileSelector::kOPT, opt[i]); + profile->setDimensions(name, nvinfer1::OptProfileSelector::kMAX, max[i]); + } + return Status::OK(); + } +#endif + + // Returns true if profile range completely includes the given shapes. + bool IncludesShapes(const std::vector& shapes) const { + // min, max, and opt must have the same size which, + // already verified in SetDimensions. 
+ if (min.size() != shapes.size()) { + return false; + } + for (int i = 0; i < shapes.size(); i++) { + auto current_shape = shapes[i]; + // min, max, and opt must have the same nbDims, which is + // already verified in SetDimensions. + if (min[i].nbDims != current_shape.dims()) { + return false; + } + // Check if range [min, max] includes current_shape. + for (int dim = 0; dim < current_shape.dims(); dim++) { + if ((min[i].d[dim] > current_shape.dim_size(dim)) || + (max[i].d[dim] < current_shape.dim_size(dim))) { + return false; + } + } + } + return true; + } +}; + +// Manages Optimization profiles during TRT Engine construction. +// +// An optimization profile describes a range of dimensions for each TRT network +// input, and the optimal dimensions that the auto-tuner should use for +// optimization. +// +// This class stores the list of input shapes that were seen during the +// build/profile_generation_mode phase, and using them it creates a set +// of OptimizationProfileConfigs. These configs will be added to +// IBuilderConfig before the engine is created. +// +class TrtShapeOptimizationProfile { + public: + TrtShapeOptimizationProfile(){}; + + // Stores input shape information during profile_generation_mode + void AddShape(std::vector shapes) { + input_shapes_.insert(shapes); + VLOG(1) << "Collected shape(s) " << DebugString(shapes) << " for profiles."; + } + + void clear() { profiles_.clear(); } + + // Returns the profile number that should be used to execute the network with + // the given input shapes. Returns -1 if none of cached profiles are + // compatible with the given input shapes. + int GetProfileNumber(std::vector shapes); + +#if IS_TRT_VERSION_GE(6, 0, 0, 0) + // Creates optimization profiles and add them to the builder config. + Status ConfigureBuilder(nvinfer1::IBuilder* builder, + nvinfer1::IBuilderConfig* config, + const nvinfer1::INetworkDefinition* network); +#endif + + // Creates execution contexts for each optimization profile. + Status CreateExecutionContexts( + nvinfer1::ICudaEngine* engine, + std::vector>& exec_context); + + /// Map input vector shapes to TRT Optimization profiles (min, max, opt) + // i.e. maps input_shapes_ to profiles_ + void InitProfiles(); + + // Returns number of created profiles. + int GetNumProfiles() const; + + // Restore profiles from the engine (used after deserialization) + Status RestoreProfiles(const nvinfer1::ICudaEngine* engine); + + private: + // Set of input shape vetors that we collect during profile_generation_mode + std::unordered_set, VectorTensorShapeHasher> + input_shapes_; + + // The optimization profiles generated from input_shapes_ + std::vector profiles_; + +#if IS_TRT_VERSION_GE(6, 0, 0, 0) + /// Add optimization profiles to the builder config + Status AddProfiles(nvinfer1::IBuilder* builder, + nvinfer1::IBuilderConfig* config, + const nvinfer1::INetworkDefinition* network); +#endif +}; + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_TENSORRT +#endif // GOOGLE_CUDA +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_SHAPE_OPTIMIZATION_PROFILES_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc new file mode 100644 index 00000000000..0fe96afc713 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc @@ -0,0 +1,214 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include + +#include "absl/memory/memory.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/platform/test.h" + +#if GOOGLE_CUDA +#if GOOGLE_TENSORRT + +#include "third_party/tensorrt/NvInfer.h" + +namespace tensorflow { +namespace tensorrt { + +std::vector dimvec2shapevec(std::vector dimvec) { + std::vector shapevec(dimvec.size()); + for (int i = 0; i < dimvec.size(); i++) { + TensorShape shape; + TensorShapeUtils::MakeShape(dimvec[i].d, dimvec[i].nbDims, &shape); + shapevec[i] = shape; + } + return shapevec; +} + +bool dimsContained(const nvinfer1::Dims& dim, const nvinfer1::Dims& min, + const nvinfer1::Dims& max) { + if (dim.nbDims != min.nbDims || dim.nbDims != max.nbDims) { + return false; + } + for (int i = 0; i < dim.nbDims; i++) { + if (dim.d[i] < min.d[i] || dim.d[i] > max.d[i]) { + return false; + } + } + return true; +} + +bool dimsEqual(const nvinfer1::Dims& a, const nvinfer1::Dims& b) { + if (a.nbDims != b.nbDims) { + return false; + } + for (int i = 0; i < a.nbDims; i++) { + if (a.d[i] != b.d[i]) { + return false; + } + } + return true; +} + +class TrtShapeOptimizationProfileTest : public ::testing::Test { + protected: + void SetUp() override { + builder_ = TrtUniquePtrType( + nvinfer1::createInferBuilder(logger_)); +#if IS_TRT_VERSION_GE(6, 0, 0, 0) + network_ = TrtUniquePtrType( + builder_->createNetworkV2(flags_)); + builder_config_ = TrtUniquePtrType( + builder_->createBuilderConfig()); + builder_config_->setMaxWorkspaceSize(1 << 10); +#else + network_ = TrtUniquePtrType( + builder_->createNetwork()); + builder_->setMaxWorkspaceSize(1 << 10); +#endif + } + + // define a simple network: output = input1 + input2 + void DefineNetwork(nvinfer1::INetworkDefinition* network, + nvinfer1::Dims3& dims) { + nvinfer1::ITensor* input1 = + network->addInput("input1", nvinfer1::DataType::kFLOAT, dims); + EXPECT_NE(nullptr, input1); + + nvinfer1::ITensor* input2 = + network->addInput("input2", nvinfer1::DataType::kFLOAT, dims); + EXPECT_NE(nullptr, input1); + + auto layer = network->addElementWise(*input1, *input2, + nvinfer1::ElementWiseOperation::kSUM); + EXPECT_NE(nullptr, layer); + // Mark the output. + nvinfer1::ITensor* output = layer->getOutput(0); + output->setName("output"); + network->markOutput(*output); + } + + Logger logger_; + TrtUniquePtrType builder_; + TrtUniquePtrType network_; +#if IS_TRT_VERSION_GE(6, 0, 0, 0) + TrtUniquePtrType builder_config_; +#endif + TrtUniquePtrType engine; + std::vector> exec_context_; + // The order is important: exec_context_ must be destroyed first, and logger + // at last. 
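The fixture above only wires up a toy two-input network; the tests that follow drive the new class through its intended build-time call order. That sequence in isolation looks roughly like the sketch below, assuming TRT 6 explicit-batch mode; the function name and the error handling are illustrative and not part of the patch.

#include <vector>
#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h"
#include "third_party/tensorrt/NvInfer.h"

namespace tensorflow {
namespace tensorrt {

// Illustrative build-time flow: collect shapes, derive profiles, configure the
// builder, build the engine, then create one execution context per profile.
Status BuildWithProfiles(
    const std::vector<std::vector<TensorShape>>& seen_shapes,
    nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config,
    nvinfer1::INetworkDefinition* network,
    TrtUniquePtrType<nvinfer1::ICudaEngine>* engine,
    std::vector<TrtUniquePtrType<nvinfer1::IExecutionContext>>* contexts) {
  TrtShapeOptimizationProfile profiles;
  for (const auto& shapes : seen_shapes) profiles.AddShape(shapes);
  profiles.InitProfiles();  // one min=opt=max profile per collected shape set
  TF_RETURN_IF_ERROR(profiles.ConfigureBuilder(builder, config, network));
  engine->reset(builder->buildEngineWithConfig(*network, *config));
  if (!*engine) return errors::Internal("Engine build failed");
  return profiles.CreateExecutionContexts(engine->get(), *contexts);
}

}  // namespace tensorrt
}  // namespace tensorflow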
+ + const uint32_t flags_ = + 1U << static_cast( + nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); +}; + +TEST_F(TrtShapeOptimizationProfileTest, Static) { + // Network with static input shape + nvinfer1::Dims3 dims(8, 8, 10); + DefineNetwork(network_.get(), dims); + + TrtShapeOptimizationProfile profile; + +#if IS_TRT_VERSION_GE(6, 0, 0, 0) + // Configure and build engine - should be a no-op + profile.ConfigureBuilder(builder_.get(), builder_config_.get(), + network_.get()); + + engine = TrtUniquePtrType( + builder_->buildEngineWithConfig(*network_, *builder_config_)); +#else + engine = TrtUniquePtrType( + builder_->buildCudaEngine(*network_)); +#endif + EXPECT_NE(nullptr, engine); + profile.CreateExecutionContexts(engine.get(), exec_context_); + // A single execution context should be created for a graph with static input + ASSERT_EQ(exec_context_.size(), 1); + EXPECT_NE(nullptr, exec_context_[0]); + + std::vector dim_vec(2, dims); + std::vector shape_vec = dimvec2shapevec(dim_vec); + EXPECT_EQ(-1, profile.GetProfileNumber(shape_vec)); +} + +#if IS_TRT_VERSION_GE(6, 0, 0, 0) +TEST_F(TrtShapeOptimizationProfileTest, Dynamic) { + // Network with dynamic input shapes + nvinfer1::Dims3 dims(-1, -1, 10); + DefineNetwork(network_.get(), dims); + + TrtShapeOptimizationProfile profile; + std::vector> input_profiles{ + {nvinfer1::Dims3(2, 2, 10), nvinfer1::Dims3(2, 2, 10)}, + {nvinfer1::Dims3(3, 3, 10), nvinfer1::Dims3(3, 3, 10)}, + {nvinfer1::Dims3(16, 16, 10), nvinfer1::Dims3(16, 16, 10)}, + }; + + // Simulate a profile collection phase + for (auto dim_vec : input_profiles) { + std::vector shape_vec = dimvec2shapevec(dim_vec); + profile.AddShape(shape_vec); + } + profile.InitProfiles(); + + // Configure and build engine + profile.ConfigureBuilder(builder_.get(), builder_config_.get(), + network_.get()); + engine = TrtUniquePtrType( + builder_->buildEngineWithConfig(*network_.get(), *builder_config_.get())); + ASSERT_NE(nullptr, engine); + + profile.CreateExecutionContexts(engine.get(), exec_context_); + + // Each profile has an associated execution context + // This test depends on the profile creation strategy: + // e.g. 
if we would introduce a default context, then the sizes will not match + EXPECT_EQ(exec_context_.size(), input_profiles.size()); + + // Check if the profiles are assigned correctly + for (auto dimvec : input_profiles) { + std::vector shape_vec = dimvec2shapevec(dimvec); + int idx = profile.GetProfileNumber(shape_vec); + int prof_idx = exec_context_[idx]->getOptimizationProfile(); + ASSERT_GE(prof_idx, 0); + + for (int j = 0; j < dimvec.size(); j++) { + nvinfer1::Dims min = engine->getProfileDimensions( + j, prof_idx, nvinfer1::OptProfileSelector::kMIN); + nvinfer1::Dims max = engine->getProfileDimensions( + j, prof_idx, nvinfer1::OptProfileSelector::kMAX); + nvinfer1::Dims opt = engine->getProfileDimensions( + j, prof_idx, nvinfer1::OptProfileSelector::kOPT); + + EXPECT_TRUE(dimsContained(dimvec[j], min, max)); + EXPECT_TRUE(dimsEqual(dimvec[j], opt)); + } + } +} +#endif + +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_TENSORRT +#endif // GOOGLE_CUDA From 7ed30210f2355e6d2d1fe22c6525c697fddad869 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Sun, 2 Feb 2020 10:41:09 +0100 Subject: [PATCH 065/442] Restore profiles and ExecutionContexts after deserialization --- .../kernels/trt_engine_resource_ops.cc | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc index 891b75be824..fcf39962e3b 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_resource_ops.cc @@ -140,11 +140,24 @@ class InitializeTRTResource : public OpKernel { engine_instance.serialized_engine().c_str(), engine_instance.serialized_engine().size(), nullptr)); auto raw_engine = engine.get(); + std::vector> ctx_vec; + if (num_loaded_engine == 0) { + // Restore profiles if there are any. Currently only 1 engine is allowed + // in dynamic mode therefore we call this only for the 0th engine. + // it is a no-op in implicit batch mode. + resource->profiles_.RestoreProfiles(raw_engine); + resource->profiles_.CreateExecutionContexts(raw_engine, ctx_vec); + } else { + // Multiple engines are only available in static mode. For each engine + // we have only a single execution context. 
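Read as one unit, the restore path of this op is: deserialize the engine, read the optimization profiles back out of it, then recreate one execution context per profile. A sketch that pulls those steps together, assuming TRT 6 and using only calls visible in this series (the helper name is illustrative, not part of the patch):

#include <string>
#include <vector>
#include "tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h"
#include "third_party/tensorrt/NvInfer.h"

namespace tensorflow {
namespace tensorrt {

// Illustrative helper: the profiles travel inside the serialized engine, so
// they are restored from it before the per-profile contexts are recreated.
Status RestoreEngineAndContexts(
    nvinfer1::IRuntime* runtime, const std::string& serialized_engine,
    TrtShapeOptimizationProfile* profiles,
    TrtUniquePtrType<nvinfer1::ICudaEngine>* engine,
    std::vector<TrtUniquePtrType<nvinfer1::IExecutionContext>>* contexts) {
  engine->reset(runtime->deserializeCudaEngine(
      serialized_engine.data(), serialized_engine.size(), nullptr));
  if (!*engine) return errors::Internal("Engine deserialization failed");
  TF_RETURN_IF_ERROR(profiles->RestoreProfiles(engine->get()));
  return profiles->CreateExecutionContexts(engine->get(), *contexts);
}

}  // namespace tensorrt
}  // namespace tensorflow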
+ TrtUniquePtrType exec_ctx( + raw_engine->createExecutionContext()); + ctx_vec.push_back(std::move(exec_ctx)); + } resource->cache_.emplace( engine_input_shapes, absl::make_unique( - std::move(engine), TrtUniquePtrType( - raw_engine->createExecutionContext()))); + std::move(engine), std::move(ctx_vec[0]))); ++num_loaded_engine; } while (1); VLOG(1) << "Loaded " << num_loaded_engine << " TRT engines for op " From a5ac44a0da3fb5e325195577149f27a4dae9ae4a Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Mon, 10 Feb 2020 10:02:54 +0100 Subject: [PATCH 066/442] Add GetNumberOfEngineInputs function --- tensorflow/compiler/tf2tensorrt/convert/utils.cc | 16 ++++++++++++++++ tensorflow/compiler/tf2tensorrt/convert/utils.h | 5 +++++ 2 files changed, 21 insertions(+) diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.cc b/tensorflow/compiler/tf2tensorrt/convert/utils.cc index ae6555d2219..efc5e73990d 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.cc @@ -165,5 +165,21 @@ string GetLoadedTensorRTVersion() { return absl::StrCat(major, ".", minor, ".", patch); } +int GetNumberOfEngineInputs( + const nvinfer1::ICudaEngine *engine) { + int n_bindings = engine->getNbBindings(); + int n_input = 0; + for (int i=0; i < n_bindings; i++) { + if (engine->bindingIsInput(i)) n_input++; + } + // According to TensorRT 7 doc: "If the engine has been built for K profiles, + // the first getNbBindings() / K bindings are used by profile number 0, the + // following getNbBindings() / K bindings are used by profile number 1 etc." + // Therefore, to get the number of input tensors, we need to divide by the + // the number of profiles. + int n_profiles = engine->getNbOptimizationProfiles(); + return n_input / n_profiles; +} + } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.h b/tensorflow/compiler/tf2tensorrt/convert/utils.h index 97dcf8976f4..bda01108341 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.h +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.h @@ -106,6 +106,11 @@ string GetLinkedTensorRTVersion(); // TensorRT library version information {Maj, Min, Patch}. string GetLoadedTensorRTVersion(); +// Returns the number of inputs for the engine, which also correspends to the +// number of input tensors for the network. This can differ from the number of +// input bindings, because each profile has a set of bindings. 
+int GetNumberOfEngineInputs(const nvinfer1::ICudaEngine *engine); + #endif // GOOGLE_CUDA && GOOGLE_TENSORRT } // namespace tensorrt From 449d2e04914cbaf17a9dc4b9502eff93b3622246 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Mon, 10 Feb 2020 17:25:56 +0100 Subject: [PATCH 067/442] Update TRT dynamic shape tests --- .../tf2tensorrt/kernels/trt_engine_op_test.cc | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc index 784d230b0b6..f661a9ecc07 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc @@ -215,16 +215,7 @@ TEST_F(TRTEngineOpTestBase, DynamicShapes) { TensorShape input_shape({1, 2}); TRTEngineOpTestBase::AddSimpleInput(input_shape); - // We expect that TensorRT engine creation fails: we would need to configure - // the engine with optimization profiles to use dynamic input shapes, but that - // feature is not yet implemented. - // - // Since TRT engine creation has failed, we fall back to native segment. - // Calling the native segment fails for the same reason that is investigated - // in https://github.com/tensorflow/tensorflow/pull/34919. This is irrelevant - // for the current test, here we want to just check wether TRT engine creation - // has failed. - OpsTestBase::RunOpKernel(); + TF_ASSERT_OK(OpsTestBase::RunOpKernel()); // Get the engine cache. TRTEngineCacheResource* cache_resource = nullptr; @@ -237,9 +228,7 @@ TEST_F(TRTEngineOpTestBase, DynamicShapes) { EXPECT_EQ(1, cache->size()); ASSERT_EQ(1, cache->count({input_shape})); EngineContext* ectx = cache->at({input_shape}).get(); - // Since engine creation failed, we expect to find nullptr. Finding a nullptr - // indicates that unknown shapes were used to define the TensorRT network. 
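GetNumberOfEngineInputs() relies on the binding layout quoted above: with K profiles the bindings come in K equal blocks, one block per profile, which is also what lets GetTrtBindingIndex() offset a binding index by the active profile. A self-contained illustration of that arithmetic, in plain integers rather than TensorRT types:

#include <cassert>

// For an engine with num_bindings bindings and num_profiles profiles, the
// binding reported as base_index under profile 0 maps to this index when
// profile profile_index is active.
int BindingIndexForProfile(int base_index, int profile_index, int num_bindings,
                           int num_profiles) {
  const int bindings_per_profile = num_bindings / num_profiles;
  assert(base_index < bindings_per_profile);
  return profile_index * bindings_per_profile + base_index;
}

// Example: 2 inputs + 1 output built with 3 profiles gives 9 bindings in
// total; input #1 under profile 2 is binding 2 * 3 + 1 == 7.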
- EXPECT_EQ(ectx->cuda_engine, nullptr); + EXPECT_NE(ectx->cuda_engine, nullptr); } template From 1ab228164fcbc648a8b885e2d05ccbf68375758f Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Tue, 11 Feb 2020 12:34:55 +0100 Subject: [PATCH 068/442] Improve comments and style --- tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc | 2 +- .../tf2tensorrt/utils/trt_shape_optimization_profiles.cc | 7 +++---- .../tf2tensorrt/utils/trt_shape_optimization_profiles.h | 9 ++++----- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index 10805da2f06..e9c587c60e0 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -1360,7 +1360,7 @@ Status Converter::BuildCudaEngine( } if (!use_implicit_batch_ && profiles) { profiles->ConfigureBuilder(trt_builder_.get(), builder_config.get(), - network()); + network()); } VLOG(1) << "Building TensorRT engine"; engine->reset( diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc index 6d159b86d08..1646f3027f9 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc @@ -106,10 +106,9 @@ Status TrtShapeOptimizationProfile::CreateExecutionContexts( nvinfer1::ICudaEngine* engine, std::vector>& exec_context) { int i = 0; - // The following loops runs once if we have static shapes, to create a single - // execution context without profiles. - // In dynamic mode we create one context for each profile and set the - // corresponding optimization profile. + // The following loop runs once if we have static shapes, to create a single + // execution context without profiles. In dynamic mode we create one context + // for each profile and set the corresponding optimization profile. do { VLOG(1) << "Creating execution context " << i; nvinfer1::IExecutionContext* ctx = engine->createExecutionContext(); diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h index a4b98570db8..b445c4b4742 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h @@ -37,7 +37,7 @@ limitations under the License. namespace tensorflow { namespace tensorrt { -// Stores optimization profile parameters (min/opt/max of each input shape) +// Stores optimization profile parameters (min/opt/max of each input shape). // // A TensorRT optimization profile describes the possible min/max values of // each dynamic input shape along with an optimum value. These values are used @@ -112,10 +112,9 @@ struct OptimizationProfileConfig { // optimization. // // This class stores the list of input shapes that were seen during the -// build/profile_generation_mode phase, and using them it creates a set -// of OptimizationProfileConfigs. These configs will be added to -// IBuilderConfig before the engine is created. -// +// build/profile_generation_mode phase, and using them it creates a set of +// OptimizationProfileConfigs. These configs will be added to IBuilderConfig +// before the engine is created. 
class TrtShapeOptimizationProfile { public: TrtShapeOptimizationProfile(){}; From cd0f46c49afa9b7f5a212eafd6616cc7ab33e50b Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Tue, 11 Feb 2020 14:39:37 +0100 Subject: [PATCH 069/442] Safeguard TRT6 usage --- tensorflow/compiler/tf2tensorrt/convert/utils.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.cc b/tensorflow/compiler/tf2tensorrt/convert/utils.cc index efc5e73990d..4fe51047caf 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.cc @@ -177,7 +177,11 @@ int GetNumberOfEngineInputs( // following getNbBindings() / K bindings are used by profile number 1 etc." // Therefore, to get the number of input tensors, we need to divide by the // the number of profiles. +#if IS_TRT_VERSION_GE(6, 0, 0, 0) int n_profiles = engine->getNbOptimizationProfiles(); +#else + int n_profiles = 1; +#endif return n_input / n_profiles; } From bc05c61c9685e2d9c8e3d932be56932718339797 Mon Sep 17 00:00:00 2001 From: Puneeth K Date: Tue, 11 Feb 2020 19:52:08 +0530 Subject: [PATCH 070/442] Updated util.py --- tensorflow/python/util/nest.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py index c27cb8bc2f8..3298766b686 100644 --- a/tensorflow/python/util/nest.py +++ b/tensorflow/python/util/nest.py @@ -50,6 +50,7 @@ import wrapt as _wrapt from tensorflow.python import _pywrap_utils from tensorflow.python.util.compat import collections_abc as _collections_abc from tensorflow.python.util.tf_export import tf_export +from tensorflow.python.platform import tf_logging _SHALLOW_TREE_HAS_INVALID_KEYS = ( @@ -122,6 +123,7 @@ _is_attrs = _pywrap_utils.IsAttrs _is_composite_tensor = _pywrap_utils.IsCompositeTensor _is_type_spec = _pywrap_utils.IsTypeSpec _is_mutable_mapping = _pywrap_utils.IsMutableMapping +_is_mapping = _pywrap_utils.IsMapping def _sequence_like(instance, args): @@ -145,12 +147,26 @@ def _sequence_like(instance, args): result = dict(zip(_sorted(instance), args)) instance_type = type(instance) if instance_type == _collections.defaultdict: - d = instance_type(_collections.defaultdict(instance.default_factory)) + d = _collections.defaultdict(instance.default_factory) for key in instance: d[key] = result[key] return d else: - return instance_type((key, result[key]) for key in instance) + d = instance_type() + for key in instance: + d[key] = instance[key] + return d + elif _is_mapping(instance): + result = dict(zip(_sorted(instance), args)) + instance_type = type(instance) + tf_logging.log_first_n( + tf_logging.WARN, "Mapping types may not work well with tf.nest. 
Prefer using" + "MutableMapping for {}".format(instance_type), 1 + ) + d = instance_type() + for key in instance: + d[key] = instance[key] + return d elif _is_mapping_view(instance): # We can't directly construct mapping views, so we create a list instead return list(args) From b5b25992ee1b9a648edd19bd522549d64c0e9996 Mon Sep 17 00:00:00 2001 From: Puneeth K Date: Tue, 11 Feb 2020 21:35:43 +0530 Subject: [PATCH 071/442] Updated nest.py --- tensorflow/python/util/nest.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py index 3298766b686..1aceea3ce23 100644 --- a/tensorflow/python/util/nest.py +++ b/tensorflow/python/util/nest.py @@ -148,14 +148,11 @@ def _sequence_like(instance, args): instance_type = type(instance) if instance_type == _collections.defaultdict: d = _collections.defaultdict(instance.default_factory) - for key in instance: - d[key] = result[key] - return d else: d = instance_type() - for key in instance: - d[key] = instance[key] - return d + for key in instance: + d[key] = result[key] + return d elif _is_mapping(instance): result = dict(zip(_sorted(instance), args)) instance_type = type(instance) @@ -163,10 +160,7 @@ def _sequence_like(instance, args): tf_logging.WARN, "Mapping types may not work well with tf.nest. Prefer using" "MutableMapping for {}".format(instance_type), 1 ) - d = instance_type() - for key in instance: - d[key] = instance[key] - return d + return instance_type((key, result[key]) for key in instance) elif _is_mapping_view(instance): # We can't directly construct mapping views, so we create a list instead return list(args) From f00f47f8d6e69728e327a47a1a0d3b3ca569addf Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 11 Feb 2020 23:37:15 +0000 Subject: [PATCH 072/442] Add licenses to license check builds --- tensorflow/tools/lib_package/BUILD | 6 ++++++ tensorflow/tools/pip_package/BUILD | 3 +++ 2 files changed, 9 insertions(+) diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD index fb88a61b424..52a48c09af3 100644 --- a/tensorflow/tools/lib_package/BUILD +++ b/tensorflow/tools/lib_package/BUILD @@ -173,6 +173,9 @@ genrule( "//tensorflow:no_aws_support": [], "//conditions:default": [ "@aws//:LICENSE", + "@aws-checksums//:LICENSE", + "@aws-c-event-stream//:LICENSE", + "@aws-c-common//:LICENSE", ], }) + select({ "//tensorflow:android": [], @@ -253,6 +256,9 @@ genrule( "//tensorflow:no_aws_support": [], "//conditions:default": [ "@aws//:LICENSE", + "@aws-checksums//:LICENSE", + "@aws-c-event-stream//:LICENSE", + "@aws-c-common//:LICENSE", ], }) + select({ "//tensorflow:android": [], diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index 226cffa6062..7eb40cfffe7 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -177,6 +177,9 @@ filegroup( "//tensorflow:no_aws_support": [], "//conditions:default": [ "@aws//:LICENSE", + "@aws-c-common//:LICENSE", + "@aws-c-event-stream//:LICENSE", + "@aws-checksums//:LICENSE", ], }) + select({ "//tensorflow:android": [], From 54457e58a7ac020f1446f661b72306d7d94baf1b Mon Sep 17 00:00:00 2001 From: Puneeth K Date: Wed, 12 Feb 2020 15:52:37 +0530 Subject: [PATCH 073/442] Fixed code for pylint --- tensorflow/python/util/nest.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py index 1aceea3ce23..3449eafaad3 100644 --- 
a/tensorflow/python/util/nest.py +++ b/tensorflow/python/util/nest.py @@ -151,14 +151,14 @@ def _sequence_like(instance, args): else: d = instance_type() for key in instance: - d[key] = result[key] + d[key] = result[key] return d elif _is_mapping(instance): result = dict(zip(_sorted(instance), args)) instance_type = type(instance) tf_logging.log_first_n( - tf_logging.WARN, "Mapping types may not work well with tf.nest. Prefer using" - "MutableMapping for {}".format(instance_type), 1 + tf_logging.WARN, "Mapping types may not work well with tf.nest. Prefer" + "using MutableMapping for {}".format(instance_type), 1 ) return instance_type((key, result[key]) for key in instance) elif _is_mapping_view(instance): @@ -278,8 +278,8 @@ def flatten(structure, expand_composites=False): running. Args: - structure: an arbitrarily nested structure. Note, numpy arrays are considered - atoms and are not flattened. + structure: an arbitrarily nested structure. Note, numpy arrays are + considered atoms and are not flattened. expand_composites: If true, then composite tensors such as tf.SparseTensor and tf.RaggedTensor are expanded into their component tensors. @@ -532,8 +532,9 @@ def map_structure(func, *structure, **kwargs): Args: func: A callable that accepts as many arguments as there are structures. - *structure: scalar, or tuple or dict or list of constructed scalars and/or other - tuples/lists, or scalars. Note: numpy arrays are considered as scalars. + *structure: scalar, or tuple or dict or list of constructed scalars and/or + other tuples/lists, or scalars. Note: numpy arrays are considered as + scalars. **kwargs: Valid keyword args are: * `check_types`: If set to `True` (default) the types of From 720c2859afc2094f6a51f8291ff43a0165880db0 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Wed, 12 Feb 2020 17:07:24 +0100 Subject: [PATCH 074/442] Add description to ValidateTensorProperties --- .../tf2tensorrt/convert/convert_nodes.cc | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index e9c587c60e0..411ef6a8312 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -250,6 +250,19 @@ void GetInputProperties(const grappler::GraphProperties& graph_properties, } } +// This function checks if a tensor is compatible with TRT. +// +// We check that the shape and datatype is compatible with TensorRT. We also +// return the corresponding trt_dtype, the trt_dims and the batch_size (latter +// is only needed in implicit batch mode). +// +// The return status indicates wether the tensor is compatible. +// +// If validation_only == false, then we make an additional check. In implicit +// batch mode we check that all inputs for the network has static shape (as +// required by the TensorRT). The only exception is the batch size, which +// could be unknown. In contrast, using explicit batch mode this test is not +// necessary, since any dimension could be unknown in explicit batch mode. Status ValidateTensorProperties(const string& producer_node_type, const DataType dtype, const PartialTensorShape& shape, @@ -294,11 +307,7 @@ Status ValidateTensorProperties(const string& producer_node_type, if (validation_only) return Status::OK(); - // Following checks are only used during TRT engine creation time. 
In implicit - // batch mode we check that all inputs for the network has static shape (as - // required by the TensorRT). The only exception is the batch size, which - // could be unknown. In contrast, using explicit batch mode this test is not - // necessary, since any dimension could be unknown in explicit batch mode. + // Following checks are only used during TRT engine creation time. if (use_implicit_batch) { for (int d = first_trt_dim; d < shape.dims(); ++d) { if (shape.dim_size(d) < 0) { From f00d2b0e931852905e9ac7c306cdcdf5e5f9dc67 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Thu, 13 Feb 2020 12:18:33 +0100 Subject: [PATCH 075/442] Improve comments, naming style, and fix copyright year. --- .../tf2tensorrt/utils/trt_lru_cache.h | 6 ++-- .../utils/trt_shape_optimization_profiles.cc | 2 +- .../utils/trt_shape_optimization_profiles.h | 2 +- .../trt_shape_optimization_profiles_test.cc | 31 ++++++++++--------- 4 files changed, 22 insertions(+), 19 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h index c652d364485..63c2acd00bc 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h @@ -184,9 +184,9 @@ class TRTEngineCacheResource : public ResourceBase { // attach it to each item of the cache. std::unique_ptr calib_ctx_; - // This object maintains all the optimization profiles during profile generation - // and engine build. We currently don't use this object during runtime, instead - // we deserialize the profiles out of the cached engines. + // This object maintains all the optimization profiles during profile + // generation and engine build. During runtime the list of profiles is used to + // look up a matching profile for the input data. TrtShapeOptimizationProfile profiles_; }; diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc index 1646f3027f9..4e4ad0a3649 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h index b445c4b4742..5685acea15f 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc index 0fe96afc713..56a6c430279 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -33,7 +33,7 @@ limitations under the License. namespace tensorflow { namespace tensorrt { -std::vector dimvec2shapevec(std::vector dimvec) { +std::vector DimVecToShapeVec(std::vector dimvec) { std::vector shapevec(dimvec.size()); for (int i = 0; i < dimvec.size(); i++) { TensorShape shape; @@ -43,7 +43,7 @@ std::vector dimvec2shapevec(std::vector dimvec) { return shapevec; } -bool dimsContained(const nvinfer1::Dims& dim, const nvinfer1::Dims& min, +bool DimsContained(const nvinfer1::Dims& dim, const nvinfer1::Dims& min, const nvinfer1::Dims& max) { if (dim.nbDims != min.nbDims || dim.nbDims != max.nbDims) { return false; @@ -56,7 +56,7 @@ bool dimsContained(const nvinfer1::Dims& dim, const nvinfer1::Dims& min, return true; } -bool dimsEqual(const nvinfer1::Dims& a, const nvinfer1::Dims& b) { +bool DimsEqual(const nvinfer1::Dims& a, const nvinfer1::Dims& b) { if (a.nbDims != b.nbDims) { return false; } @@ -86,7 +86,7 @@ class TrtShapeOptimizationProfileTest : public ::testing::Test { #endif } - // define a simple network: output = input1 + input2 + // Define a simple network: output = input1 + input2. void DefineNetwork(nvinfer1::INetworkDefinition* network, nvinfer1::Dims3& dims) { nvinfer1::ITensor* input1 = @@ -147,7 +147,7 @@ TEST_F(TrtShapeOptimizationProfileTest, Static) { EXPECT_NE(nullptr, exec_context_[0]); std::vector dim_vec(2, dims); - std::vector shape_vec = dimvec2shapevec(dim_vec); + std::vector shape_vec = DimVecToShapeVec(dim_vec); EXPECT_EQ(-1, profile.GetProfileNumber(shape_vec)); } @@ -166,7 +166,7 @@ TEST_F(TrtShapeOptimizationProfileTest, Dynamic) { // Simulate a profile collection phase for (auto dim_vec : input_profiles) { - std::vector shape_vec = dimvec2shapevec(dim_vec); + std::vector shape_vec = DimVecToShapeVec(dim_vec); profile.AddShape(shape_vec); } profile.InitProfiles(); @@ -180,14 +180,12 @@ TEST_F(TrtShapeOptimizationProfileTest, Dynamic) { profile.CreateExecutionContexts(engine.get(), exec_context_); - // Each profile has an associated execution context - // This test depends on the profile creation strategy: - // e.g. if we would introduce a default context, then the sizes will not match + // Each profile has an associated execution context. EXPECT_EQ(exec_context_.size(), input_profiles.size()); - // Check if the profiles are assigned correctly + // Check if the profiles are assigned correctly. 
for (auto dimvec : input_profiles) { - std::vector shape_vec = dimvec2shapevec(dimvec); + std::vector shape_vec = DimVecToShapeVec(dimvec); int idx = profile.GetProfileNumber(shape_vec); int prof_idx = exec_context_[idx]->getOptimizationProfile(); ASSERT_GE(prof_idx, 0); @@ -200,8 +198,13 @@ TEST_F(TrtShapeOptimizationProfileTest, Dynamic) { nvinfer1::Dims opt = engine->getProfileDimensions( j, prof_idx, nvinfer1::OptProfileSelector::kOPT); - EXPECT_TRUE(dimsContained(dimvec[j], min, max)); - EXPECT_TRUE(dimsEqual(dimvec[j], opt)); + // This should always hold. + EXPECT_TRUE(DimsContained(dimvec[j], min, max)); + + // The following test depends on the profile creation strategy, and needs + // to be updated (disabled) if the default trategy (defined by + // InitProfiles) changes. + EXPECT_TRUE(DimsEqual(dimvec[j], opt)); } } } From 7f467879467d66e83834afd54db9d51f07095372 Mon Sep 17 00:00:00 2001 From: Niranjan Hasabnis Date: Thu, 13 Feb 2020 15:43:36 -0800 Subject: [PATCH 076/442] [Intel MKL] Updating MatMul kernels with MKLDNN 1.x API changes This PR updates QMatMul and FusedMatMul MKL CPU kernels with MKLDNN 1.0 API. It also updates MatMul and BatchMatMul BFloat16 kernels for MKL CPU with MKLDNN 1.2 API. Some of the changes are suggested by clang formet check tool 8.0.1 version. --- tensorflow/core/kernels/BUILD | 5 +- .../core/kernels/mkl_batch_matmul_op.cc | 161 +++++++----- tensorflow/core/kernels/mkl_matmul_op.cc | 14 ++ .../core/kernels/mkl_matmul_op_fused.cc | 37 +-- .../core/kernels/mkl_matmul_ops_common.h | 234 +++++++++++++++--- tensorflow/core/kernels/mkl_qmatmul_op.cc | 75 ++++-- 6 files changed, 390 insertions(+), 136 deletions(-) diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 409f52db948..00ca20c10a6 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -3951,7 +3951,10 @@ tf_kernel_library( tf_mkl_kernel_library( name = "mkl_batch_matmul_op", srcs = ["mkl_batch_matmul_op.cc"], - hdrs = ["batch_matmul_op_impl.h"], + hdrs = [ + "batch_matmul_op_impl.h", + "mkl_matmul_ops_common.h", + ], deps = MATH_DEPS + mkl_deps(), ) diff --git a/tensorflow/core/kernels/mkl_batch_matmul_op.cc b/tensorflow/core/kernels/mkl_batch_matmul_op.cc index 8966260c4fe..f96f0e1183f 100644 --- a/tensorflow/core/kernels/mkl_batch_matmul_op.cc +++ b/tensorflow/core/kernels/mkl_batch_matmul_op.cc @@ -29,7 +29,6 @@ limitations under the License. #include #include "mkl_cblas.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -39,10 +38,12 @@ limitations under the License. 
#include "tensorflow/core/framework/types.h" #include "tensorflow/core/kernels/batch_matmul_op_impl.h" #include "tensorflow/core/kernels/fill_functor.h" +#include "tensorflow/core/kernels/mkl_matmul_ops_common.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/matmul_bcast.h" #include "tensorflow/core/util/mkl_util.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" namespace tensorflow { @@ -53,16 +54,16 @@ typedef Eigen::ThreadPoolDevice CPUDevice; template class BatchMatMulMkl : public OpKernel { public: - explicit BatchMatMulMkl(OpKernelConstruction *context) : OpKernel(context) { + explicit BatchMatMulMkl(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("adj_x", &adj_x_)); OP_REQUIRES_OK(context, context->GetAttr("adj_y", &adj_y_)); } virtual ~BatchMatMulMkl() {} - void Compute(OpKernelContext *ctx) override { - const Tensor &lhs = ctx->input(0); - const Tensor &rhs = ctx->input(1); + void Compute(OpKernelContext* ctx) override { + const Tensor& lhs = ctx->input(0); + const Tensor& rhs = ctx->input(1); if (!v2_bcast) { // Using V1, so check to make sure lhs and rhs dimensions are correct and @@ -122,7 +123,7 @@ class BatchMatMulMkl : public OpKernel { out_shape.AddDim(lhs_rows); out_shape.AddDim(rhs_cols); - Tensor *out = nullptr; + Tensor* out = nullptr; OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out)); if (out->NumElements() == 0) { return; @@ -147,9 +148,9 @@ class BatchMatMulMkl : public OpKernel { std::vector ldb_array(batch_size, adj_y_ ? K : N); std::vector ldc_array(batch_size, N); std::vector group_size(1, batch_size); - std::vector a_array; - std::vector b_array; - std::vector c_array; + std::vector a_array; + std::vector b_array; + std::vector c_array; a_array.reserve(batch_size); b_array.reserve(batch_size); c_array.reserve(batch_size); @@ -163,8 +164,8 @@ class BatchMatMulMkl : public OpKernel { } else { // Broadcasting is needed, so get the mapping from flattened output batch // indices to x's and y's flattened batch indices. 
- const std::vector &a_batch_indices = bcast.x_batch_indices(); - const std::vector &b_batch_indices = bcast.y_batch_indices(); + const std::vector& a_batch_indices = bcast.x_batch_indices(); + const std::vector& b_batch_indices = bcast.y_batch_indices(); for (int64 i = 0; i < batch_size; i++) { a_array.push_back(&lhs_reshaped(a_batch_indices[i], 0, 0)); @@ -173,96 +174,121 @@ class BatchMatMulMkl : public OpKernel { } } - MklCblasGemmBatch(CblasRowMajor, adj_x_, adj_y_, &m_array[0], &n_array[0], - &k_array[0], &a_array[0], &lda_array[0], &b_array[0], - &ldb_array[0], &c_array[0], &ldc_array[0], 1, - &group_size[0]); + MklCblasGemmBatch(CblasRowMajor, adj_x_, adj_y_, m_array, n_array, k_array, + &a_array[0], lda_array, &b_array[0], ldb_array, + &c_array[0], ldc_array, 1, group_size); } private: bool adj_x_; bool adj_y_; - void MklCblasGemmBatch(const CBLAS_LAYOUT Layout, const bool TransA, - const bool TransB, const MKL_INT *M_Array, - const MKL_INT *N_Array, const MKL_INT *K_Array, - const float **A_Array, const MKL_INT *lda_Array, - const float **B_Array, const MKL_INT *ldb_Array, - float **C_Array, const MKL_INT *ldc_Array, - const MKL_INT group_count, const MKL_INT *group_size) { + void MklCblasGemmBatch( + const CBLAS_LAYOUT Layout, const bool TransA, const bool TransB, + const std::vector& M_Array, const std::vector& N_Array, + const std::vector& K_Array, const float** A_Array, + const std::vector& lda_Array, const float** B_Array, + const std::vector& ldb_Array, float** C_Array, + const std::vector& ldc_Array, const MKL_INT group_count, + const std::vector& group_size) { std::vector TransA_Array( group_size[0], TransA ? CblasTrans : CblasNoTrans); std::vector TransB_Array( group_size[0], TransB ? CblasTrans : CblasNoTrans); std::vector alpha_Array(group_size[0], 1.0); std::vector beta_Array(group_size[0], 0.0); - cblas_sgemm_batch(Layout, &TransA_Array[0], &TransB_Array[0], M_Array, - N_Array, K_Array, &alpha_Array[0], A_Array, lda_Array, - B_Array, ldb_Array, &beta_Array[0], C_Array, ldc_Array, - group_count, group_size); + cblas_sgemm_batch(Layout, &TransA_Array[0], &TransB_Array[0], &M_Array[0], + &N_Array[0], &K_Array[0], &alpha_Array[0], A_Array, + &lda_Array[0], B_Array, &ldb_Array[0], &beta_Array[0], + C_Array, &ldc_Array[0], group_count, &group_size[0]); } - void MklCblasGemmBatch(const CBLAS_LAYOUT Layout, const bool TransA, - const bool TransB, const MKL_INT *M_Array, - const MKL_INT *N_Array, const MKL_INT *K_Array, - const double **A_Array, const MKL_INT *lda_Array, - const double **B_Array, const MKL_INT *ldb_Array, - double **C_Array, const MKL_INT *ldc_Array, - const MKL_INT group_count, const MKL_INT *group_size) { +#ifdef ENABLE_MKLDNN_V1_2 + void MklCblasGemmBatch( + const CBLAS_LAYOUT Layout, const bool TransA, const bool TransB, + const std::vector& M_Array, const std::vector& N_Array, + const std::vector& K_Array, const bfloat16** A_Array, + const std::vector& lda_Array, const bfloat16** B_Array, + const std::vector& ldb_Array, bfloat16** C_Array, + const std::vector& ldc_Array, const MKL_INT group_count, + const std::vector& group_size) { + std::vector TransA_Array(group_size[0], TransA); + std::vector TransB_Array(group_size[0], TransB); + std::vector alpha_Array(group_size[0], 1.0); + std::vector beta_Array(group_size[0], 0.0); + dnnl_gemm_batch(Layout, TransA_Array, TransB_Array, M_Array, + N_Array, K_Array, alpha_Array, A_Array, lda_Array, + B_Array, ldb_Array, beta_Array, C_Array, + ldc_Array, group_count, group_size); + } +#endif // ENABLE_MKLDNN_V1_2 + + 
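These MklCblasGemmBatch overloads all funnel into MKL's grouped *gemm_batch interface, where every parameter is an array with one entry per group and group_size[g] problems share that entry. A minimal standalone call with a single group of two 2x2 row-major multiplications, assuming MKL's mkl_cblas.h is available (the matrices are only there to make the sketch runnable):

#include <vector>
#include "mkl_cblas.h"

int main() {
  const MKL_INT n = 2;  // every matrix in the group is 2x2, row-major
  std::vector<float> a0 = {1, 2, 3, 4}, a1 = {2, 0, 0, 2};
  std::vector<float> b0 = {5, 6, 7, 8}, b1 = {1, 1, 1, 1};
  std::vector<float> c0(4), c1(4);

  CBLAS_TRANSPOSE trans = CblasNoTrans;
  MKL_INT m = n, k = n, ld = n, group_size = 2;
  float alpha = 1.0f, beta = 0.0f;
  const float* a_array[] = {a0.data(), a1.data()};
  const float* b_array[] = {b0.data(), b1.data()};
  float* c_array[] = {c0.data(), c1.data()};

  // One group: both multiplications share m, n, k, alpha, beta and strides.
  cblas_sgemm_batch(CblasRowMajor, &trans, &trans, &m, &n, &k, &alpha, a_array,
                    &ld, b_array, &ld, &beta, c_array, &ld,
                    /*group_count=*/1, &group_size);
  return 0;
}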
void MklCblasGemmBatch( + const CBLAS_LAYOUT Layout, const bool TransA, const bool TransB, + const std::vector& M_Array, const std::vector& N_Array, + const std::vector& K_Array, const double** A_Array, + const std::vector& lda_Array, const double** B_Array, + const std::vector& ldb_Array, double** C_Array, + const std::vector& ldc_Array, const MKL_INT group_count, + const std::vector& group_size) { std::vector TransA_array( group_size[0], TransA ? CblasTrans : CblasNoTrans); std::vector TransB_array( group_size[0], TransB ? CblasTrans : CblasNoTrans); std::vector alpha_Array(group_size[0], 1.0); std::vector beta_Array(group_size[0], 0.0); - cblas_dgemm_batch(Layout, &TransA_array[0], &TransB_array[0], M_Array, - N_Array, K_Array, &alpha_Array[0], A_Array, lda_Array, - B_Array, ldb_Array, &beta_Array[0], C_Array, ldc_Array, - group_count, group_size); + cblas_dgemm_batch(Layout, &TransA_array[0], &TransB_array[0], &M_Array[0], + &N_Array[0], &K_Array[0], &alpha_Array[0], A_Array, + &lda_Array[0], B_Array, &ldb_Array[0], &beta_Array[0], + C_Array, &ldc_Array[0], group_count, &group_size[0]); } - void MklCblasGemmBatch(const CBLAS_LAYOUT Layout, const bool TransA, - const bool TransB, const MKL_INT *M_Array, - const MKL_INT *N_Array, const MKL_INT *K_Array, - const complex64 **A_Array, const MKL_INT *lda_Array, - const complex64 **B_Array, const MKL_INT *ldb_Array, - complex64 **C_Array, const MKL_INT *ldc_Array, - const MKL_INT group_count, const MKL_INT *group_size) { + void MklCblasGemmBatch( + const CBLAS_LAYOUT Layout, const bool TransA, const bool TransB, + const std::vector& M_Array, const std::vector& N_Array, + const std::vector& K_Array, const complex64** A_Array, + const std::vector& lda_Array, const complex64** B_Array, + const std::vector& ldb_Array, complex64** C_Array, + const std::vector& ldc_Array, const MKL_INT group_count, + const std::vector& group_size) { std::vector TransA_array( group_size[0], TransA ? CblasConjTrans : CblasNoTrans); std::vector TransB_array( group_size[0], TransB ? 
CblasConjTrans : CblasNoTrans); std::vector alpha_Array(group_size[0], {1.0f, 0.0f}); std::vector beta_Array(group_size[0], {0.0f, 0.0f}); - cblas_cgemm_batch( - Layout, &TransA_array[0], &TransB_array[0], M_Array, N_Array, K_Array, - static_cast(&alpha_Array[0]), - reinterpret_cast(A_Array), lda_Array, - reinterpret_cast(B_Array), ldb_Array, - static_cast(&beta_Array[0]), - reinterpret_cast(C_Array), ldc_Array, group_count, group_size); + cblas_cgemm_batch(Layout, &TransA_array[0], &TransB_array[0], &M_Array[0], + &N_Array[0], &K_Array[0], + static_cast(&alpha_Array[0]), + reinterpret_cast(A_Array), &lda_Array[0], + reinterpret_cast(B_Array), &ldb_Array[0], + static_cast(&beta_Array[0]), + reinterpret_cast(C_Array), &ldc_Array[0], + group_count, &group_size[0]); } - void MklCblasGemmBatch(const CBLAS_LAYOUT Layout, const bool TransA, - const bool TransB, const MKL_INT *M_Array, - const MKL_INT *N_Array, const MKL_INT *K_Array, - const complex128 **A_Array, const MKL_INT *lda_Array, - const complex128 **B_Array, const MKL_INT *ldb_Array, - complex128 **C_Array, const MKL_INT *ldc_Array, - const MKL_INT group_count, const MKL_INT *group_size) { + void MklCblasGemmBatch( + const CBLAS_LAYOUT Layout, const bool TransA, const bool TransB, + const std::vector& M_Array, const std::vector& N_Array, + const std::vector& K_Array, const complex128** A_Array, + const std::vector& lda_Array, const complex128** B_Array, + const std::vector& ldb_Array, complex128** C_Array, + const std::vector& ldc_Array, const MKL_INT group_count, + const std::vector& group_size) { std::vector TransA_array( group_size[0], TransA ? CblasConjTrans : CblasNoTrans); std::vector TransB_array( group_size[0], TransB ? CblasConjTrans : CblasNoTrans); std::vector alpha_Array(group_size[0], {1.0f, 0.0f}); std::vector beta_Array(group_size[0], {0.0f, 0.0f}); - cblas_zgemm_batch( - Layout, &TransA_array[0], &TransB_array[0], M_Array, N_Array, K_Array, - static_cast(&alpha_Array[0]), - reinterpret_cast(A_Array), lda_Array, - reinterpret_cast(B_Array), ldb_Array, - static_cast(&beta_Array[0]), - reinterpret_cast(C_Array), ldc_Array, group_count, group_size); + cblas_zgemm_batch(Layout, &TransA_array[0], &TransB_array[0], &M_Array[0], + &N_Array[0], &K_Array[0], + static_cast(&alpha_Array[0]), + reinterpret_cast(A_Array), &lda_Array[0], + reinterpret_cast(B_Array), &ldb_Array[0], + static_cast(&beta_Array[0]), + reinterpret_cast(C_Array), &ldc_Array[0], + group_count, &group_size[0]); } }; @@ -290,6 +316,11 @@ TF_CALL_float(REGISTER_BATCH_MATMUL_MKL_V2); TF_CALL_double(REGISTER_BATCH_MATMUL_MKL_V2); TF_CALL_complex64(REGISTER_BATCH_MATMUL_MKL_V2); TF_CALL_complex128(REGISTER_BATCH_MATMUL_MKL_V2); + +#ifdef ENABLE_MKLDNN_V1_2 +TF_CALL_bfloat16(REGISTER_BATCH_MATMUL_MKL); +TF_CALL_bfloat16(REGISTER_BATCH_MATMUL_MKL_V2); +#endif // ENABLE_MKLDNN_V1_2 #endif // ENABLE_MKL } // end namespace tensorflow diff --git a/tensorflow/core/kernels/mkl_matmul_op.cc b/tensorflow/core/kernels/mkl_matmul_op.cc index 714a1de0837..83d8255bdaa 100644 --- a/tensorflow/core/kernels/mkl_matmul_op.cc +++ b/tensorflow/core/kernels/mkl_matmul_op.cc @@ -29,6 +29,7 @@ limitations under the License. 
#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/kernels/fill_functor.h" +#include "tensorflow/core/kernels/mkl_matmul_ops_common.h" #include "tensorflow/core/util/mkl_util.h" // This header file is part of MKL ML, need equivalent file in MKL DNN @@ -183,6 +184,15 @@ class MklMatMulOp : public OpKernel { const int index_transa = transa ? 1 : 0; const int index_transb = transb ? 1 : 0; +#ifdef ENABLE_MKLDNN_V1 +#ifdef ENABLE_MKLDNN_V1_2 + dnnl_gemm(transa ? CblasTrans : CblasNoTrans, + transb ? CblasTrans : CblasNoTrans, m, n, k, alpha, a, + lda, b, ldb, beta, c, ldc); +#else +// There is no MatMul support for bfloat16 type in MKLDNN1.0. +#endif // ENABLE_MKLDNN_V1_2 +#else Tensor c_float; OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_FLOAT, {m, n}, &c_float)); @@ -195,6 +205,7 @@ class MklMatMulOp : public OpKernel { &beta, c_float.flat().data(), &ldc); FloatToBFloat16(c_float.flat().data(), c, c_float.NumElements()); +#endif // ENABLE_MKLDNN_V1 } // MKL-DNN only supports SGEMM and bfloat16-GEMM. @@ -257,7 +268,10 @@ class MklMatMulOp : public OpKernel { // TODO(inteltf) Consider template specialization when adding/removing // additional types TF_CALL_float(REGISTER_CPU); +#ifndef ENABLE_MKLDNN_V1 +// MKLDNNv1 does not have support for bfloat16 GEMM. Only V1.2 has that support. TF_CALL_bfloat16(REGISTER_CPU); +#endif // ENABLE_MKLDNN_V1 #ifndef INTEL_MKL_DNN_ONLY TF_CALL_double(REGISTER_CPU); diff --git a/tensorflow/core/kernels/mkl_matmul_op_fused.cc b/tensorflow/core/kernels/mkl_matmul_op_fused.cc index 02495f672d2..755919d8e68 100644 --- a/tensorflow/core/kernels/mkl_matmul_op_fused.cc +++ b/tensorflow/core/kernels/mkl_matmul_op_fused.cc @@ -103,12 +103,11 @@ class MklFusedMatMulOp : public MklDnnMatMulOpBase { memory::dims weight_dims = memory::dims({channel, k}); memory::dims bias_dims = memory::dims({channel}); memory::dims dst_dims = memory::dims({batch, channel}); - memory::format weight_format = - transpose_b_ ? memory::format::oi : memory::format::io; + MEMORY_FORMAT weight_format = + transpose_b_ ? MEMORY_FORMAT::oi : MEMORY_FORMAT::io; MklDnnMatMulFwdParams matmul_params(src_dims, weight_dims, bias_dims, dst_dims, weight_format); - // Extend the basic parameters for data types and fusions. 
ExtendMklDnnMatMulFwdParams(ctx, matmul_params); MklDnnMatMulFwdPrimitive* matmul_prim = @@ -120,8 +119,8 @@ class MklFusedMatMulOp : public MklDnnMatMulOpBase { matmul_prim->GetPrimitiveDesc(); if (src_mkl_shape.IsMklTensor() && weight_mkl_shape.IsMklTensor()) { - this->AllocateOutputTensor(ctx, *matmul_pd, dst_dims, memory::format::nc, - &dst_tensor); + this->AllocateOutputTensor(ctx, *matmul_pd, dst_dims, + MKL_TENSOR_FORMAT_NC, &dst_tensor); } else { TensorShape dst_tensor_shape({batch, channel}); MklDnnShape dst_mkl_shape; @@ -148,26 +147,34 @@ class MklFusedMatMulOp : public MklDnnMatMulOpBase { if (src_mkl_shape.IsMklTensor()) { memory::desc input_md = src_mkl_shape.GetMklLayout(); - - if (input_md.data.format != memory::format::nc) { +#ifdef ENABLE_MKLDNN_V1 + if (input_md != matmul_pd->src_desc()) { +#else + if (input_md.data.format != MKL_TENSOR_FORMAT_NC) { +#endif src_mkl.SetUsrMem(input_md, src_data); - src_mkl.CheckReorderToOpMem(matmul_pd.get()->src_primitive_desc()); + src_mkl.CheckReorderToOpMem(MEMORY_PD_WITHOUT_DATA( + matmul_pd.get()->PRIMITIVE_DESC_SRC, this->cpu_engine_)); src_data = reinterpret_cast(src_mkl.GetOpMem().get_data_handle()); } } if (weight_mkl_shape.IsMklTensor()) { memory::desc input_md = weight_mkl_shape.GetMklLayout(); - +#ifdef ENABLE_MKLDNN_V1 + if (input_md != matmul_pd->weight_desc()) { +#else if (input_md.data.format != weight_format) { +#endif weight_mkl.SetUsrMem(input_md, weight_data); - weight_mkl.CheckReorderToOpMem( - matmul_pd.get()->weights_primitive_desc()); + weight_mkl.CheckReorderToOpMem(MEMORY_PD_WITHOUT_DATA( + matmul_pd.get()->PRIMITIVE_DESC_WEIGHTS, this->cpu_engine_)); weight_data = reinterpret_cast(weight_mkl.GetOpMem().get_data_handle()); } } + // Execute fused matmul op. matmul_prim->Execute(src_data, weight_data, bias_data, dst_data); } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + @@ -180,21 +187,23 @@ class MklFusedMatMulOp : public MklDnnMatMulOpBase { void ExtendMklDnnMatMulFwdParams(OpKernelContext* ctx, MklDnnMatMulFwdParams& params) { +#ifndef ENABLE_MKL_DNN_V1 if (fused_ops_.size() == 2) { string post_op = fused_ops_[1]; if (post_op == "Relu") { - params.post_op_params.push_back({"relu", {1.0, 0.0, 0.0}}); + params.post_op_params.push_back({"relu", { 1.0, 0.0, 0.0 }}); } else if (post_op == "Relu6") { - params.post_op_params.push_back({"relu6", {1.0, 6.0, 0.0}}); + params.post_op_params.push_back({"relu6", { 1.0, 6.0, 0.0 }}); } else if (post_op == "Elu") { - params.post_op_params.push_back({"elu", {1.0, 1.0, 0.0}}); + params.post_op_params.push_back({"elu", { 1.0, 1.0, 0.0 }}); } else { OP_REQUIRES_OK( ctx, errors::InvalidArgument( "Unsupported post-argument in MklFusedMatMul: ", post_op)); } } +#endif } private: diff --git a/tensorflow/core/kernels/mkl_matmul_ops_common.h b/tensorflow/core/kernels/mkl_matmul_ops_common.h index f7666d59883..44eecc65b94 100644 --- a/tensorflow/core/kernels/mkl_matmul_ops_common.h +++ b/tensorflow/core/kernels/mkl_matmul_ops_common.h @@ -24,6 +24,7 @@ limitations under the License. 
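The MklFusedMatMulOp hunks above switch from comparing MKL-DNN v0.x format enums to comparing memory descriptors directly before deciding whether an input or weight tensor needs a reorder. A minimal sketch of that reorder-if-needed pattern under the MKL-DNN v1.x API follows; the helper name is an assumption, and the kernel itself routes this through MklDnnData::CheckReorderToOpMem and the MEMORY_PD_WITHOUT_DATA macro rather than a free function:

    // Reorder user data into the layout a primitive expects, but only when the
    // two descriptors actually differ (MKL-DNN v1.x API).
    mkldnn::memory ReorderIfNeeded(const mkldnn::memory::desc& user_md,
                                   const mkldnn::memory::desc& prim_md,
                                   void* data, const mkldnn::engine& eng,
                                   mkldnn::stream& strm) {
      mkldnn::memory user_mem(user_md, eng, data);
      if (user_md == prim_md) return user_mem;  // layouts already match, no copy
      mkldnn::memory prim_mem(prim_md, eng);    // scratch buffer in primitive layout
      mkldnn::reorder(user_mem, prim_mem)
          .execute(strm, {{MKLDNN_ARG_FROM, user_mem}, {MKLDNN_ARG_TO, prim_mem}});
      strm.wait();
      return prim_mem;
    }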
#include "mkldnn.hpp" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/util/mkl_types.h" #include "tensorflow/core/util/mkl_util.h" using mkldnn::inner_product_forward; @@ -40,7 +41,7 @@ struct MklDnnMatMulFwdParams { memory::dims weight_dims; memory::dims bias_dims; memory::dims dst_dims; - memory::format weight_fmt; + MEMORY_FORMAT weight_fmt; string dtypes = string(""); struct PostOpParam { string name; @@ -50,7 +51,7 @@ struct MklDnnMatMulFwdParams { MklDnnMatMulFwdParams(memory::dims src_dims, memory::dims weight_dims, memory::dims bias_dims, memory::dims dst_dims, - memory::format weight_fmt = memory::format::any) + MEMORY_FORMAT weight_fmt = MEMORY_FORMAT::any) : src_dims(src_dims), weight_dims(weight_dims), bias_dims(bias_dims), @@ -70,8 +71,8 @@ class MklDnnMatMulFwdPrimitive : public MklPrimitive { public: explicit MklDnnMatMulFwdPrimitive( const MklDnnMatMulFwdParams& matmulFwdParams) - : cpu_engine_(engine::cpu, 0) { - context_.fwd_stream.reset(new stream(stream::kind::eager)); + : cpu_engine_(ENGINE_CPU, 0) { + context_.fwd_stream.reset(new CPU_STREAM(cpu_engine_)); // Create matmul primitive if (context_.matmul_fwd == nullptr) { Setup(matmulFwdParams); @@ -94,7 +95,16 @@ class MklDnnMatMulFwdPrimitive : public MklPrimitive { context_.bias_mem->set_data_handle( static_cast(const_cast(bias_data))); context_.dst_mem->set_data_handle(static_cast(dst_data)); + +#ifdef ENABLE_MKLDNN_V1 + DCHECK_EQ(context_.fwd_primitives.size(), context_.net_args.size()); + for (size_t i = 0; i < context_.fwd_primitives.size(); ++i) { + context_.fwd_primitives.at(i).execute(*context_.fwd_stream, + context_.net_args.at(i)); + } +#else context_.fwd_stream->submit(context_.fwd_primitives); +#endif // ENABLE_MKLDNN_V1 // After execution, set data handle back context_.src_mem->set_data_handle(DummyData); @@ -103,8 +113,13 @@ class MklDnnMatMulFwdPrimitive : public MklPrimitive { context_.dst_mem->set_data_handle(DummyData); } +#ifndef ENABLE_MKLDNN_V1 + // In MKL-DNN v1.x, memory format tags only provide a partial description + // of the memory layout. Hence, these functions are disabled for v1.x. memory::format GetSrcMemoryFormat() const { return context_.src_fmt; } memory::format GetweightMemoryFormat() const { return context_.weight_fmt; } +#endif // ENABLE_MKLDNN_V1 + std::shared_ptr GetPrimitiveDesc() const { return context_.fwd_pd; @@ -113,34 +128,43 @@ class MklDnnMatMulFwdPrimitive : public MklPrimitive { private: // Primitive reuse context for inner-product Fwd op struct MklDnnMatMulFwdContext { +#ifndef ENABLE_MKLDNN_V1 // Expected memory format for this primitive instance - memory::format src_fmt; - memory::format weight_fmt; + MEMORY_FORMAT src_fmt; + MEMORY_FORMAT weight_fmt; +#endif // ENABLE_MKLDNN_V1 - // MKL-DNN memory + // MKL-DNN memory. std::shared_ptr src_mem; std::shared_ptr weight_mem; std::shared_ptr bias_mem; std::shared_ptr dst_mem; - // Descriptor and primitive-descriptor for forward inner-product + // Descriptor and primitive-descriptor for forward inner-product. std::shared_ptr fwd_desc; std::shared_ptr fwd_pd; - // Memory descriptors + // Memory descriptors. std::shared_ptr src_md; std::shared_ptr weight_md; std::shared_ptr bias_md; std::shared_ptr dst_md; - // Inner-product primitive + // Inner-product primitive. 
std::shared_ptr matmul_fwd; std::shared_ptr fwd_stream; std::vector fwd_primitives; +#ifdef ENABLE_MKLDNN_V1 + std::vector> net_args; +#endif // ENABLE_MKLDNN_V1 + MklDnnMatMulFwdContext() - : src_fmt(memory::format::any), - weight_fmt(memory::format::any), + : +#ifndef ENABLE_MKLDNN_V1 + src_fmt(MEMORY_FORMAT::any), + weight_fmt(MEMORY_FORMAT::any), +#endif // ENABLE_MKLDNN_V1 src_mem(nullptr), weight_mem(nullptr), bias_mem(nullptr), @@ -152,32 +176,39 @@ class MklDnnMatMulFwdPrimitive : public MklPrimitive { bias_md(nullptr), dst_md(nullptr), matmul_fwd(nullptr), - fwd_stream(nullptr) {} + fwd_stream(nullptr) { + } }; void Setup(const MklDnnMatMulFwdParams& matmul_fwd_params) { - // Create memory descriptors for inner-product data with no specified format + // Create memory descriptors for inner-product data without specified + // format. context_.src_md.reset(new memory::desc({matmul_fwd_params.src_dims}, MklDnnType(), - memory::format::any)); + MEMORY_FORMAT::any)); context_.weight_md.reset(new memory::desc({matmul_fwd_params.weight_dims}, MklDnnType(), +#ifdef ENABLE_MKLDNN_V1 + MEMORY_FORMAT::any)); +#else matmul_fwd_params.weight_fmt)); +#endif context_.dst_md.reset(new memory::desc({matmul_fwd_params.dst_dims}, MklDnnType(), - memory::format::any)); + MEMORY_FORMAT::any)); context_.bias_md.reset(new memory::desc({matmul_fwd_params.bias_dims}, MklDnnType(), - memory::format::any)); - // Create an inner-product + MEMORY_FORMAT::any)); + // Create an inner-product. context_.fwd_desc.reset(new inner_product_forward::desc( prop_kind::forward_inference, *context_.src_md, *context_.weight_md, *context_.bias_md, *context_.dst_md)); context_.fwd_pd.reset(new inner_product_forward::primitive_desc( *context_.fwd_desc, cpu_engine_)); + // Check if there is any fusion as post-ops auto const& post_op_params = matmul_fwd_params.post_op_params; mkldnn::primitive_attr post_ops_attr; @@ -189,21 +220,21 @@ class MklDnnMatMulFwdPrimitive : public MklPrimitive { float op_scale = post_op_param.param[0]; float op_alpha = post_op_param.param[1]; float op_beta = post_op_param.param[2]; - post_ops.append_eltwise(op_scale, mkldnn::eltwise_relu, op_alpha, + post_ops.append_eltwise(op_scale, ALGORITHM::eltwise_relu, op_alpha, op_beta); } else if (post_op_param.name == "relu6") { DCHECK_EQ(post_op_param.param.size(), 3); float op_scale = post_op_param.param[0]; float op_alpha = post_op_param.param[1]; float op_beta = post_op_param.param[2]; - post_ops.append_eltwise(op_scale, mkldnn::eltwise_bounded_relu, + post_ops.append_eltwise(op_scale, ALGORITHM::eltwise_bounded_relu, op_alpha, op_beta); } else if (post_op_param.name == "elu") { DCHECK_EQ(post_op_param.param.size(), 3); float op_scale = post_op_param.param[0]; float op_alpha = post_op_param.param[1]; float op_beta = post_op_param.param[2]; - post_ops.append_eltwise(op_scale, mkldnn::eltwise_elu, op_alpha, + post_ops.append_eltwise(op_scale, ALGORITHM::eltwise_elu, op_alpha, op_beta); } else if (post_op_param.name == "output_scale") { DCHECK_EQ(post_op_param.param.size(), 1); @@ -225,30 +256,39 @@ class MklDnnMatMulFwdPrimitive : public MklPrimitive { *context_.fwd_desc, cpu_engine_)); } - // Store the expected memory format +#ifndef ENABLE_MKLDNN_V1 + // Store the expected memory format. 
context_.src_fmt = static_cast( context_.fwd_pd.get()->src_primitive_desc().desc().data.format); context_.weight_fmt = static_cast( context_.fwd_pd.get()->weights_primitive_desc().desc().data.format); +#endif // Create memory primitive based on dummy data - context_.src_mem.reset( - new memory(context_.fwd_pd.get()->src_primitive_desc(), DummyData)); - context_.weight_mem.reset( - new memory(context_.fwd_pd.get()->weights_primitive_desc(), DummyData)); - context_.dst_mem.reset( - new memory(context_.fwd_pd.get()->dst_primitive_desc(), DummyData)); - context_.bias_mem.reset(new memory({{{matmul_fwd_params.bias_dims}, - MklDnnType(), - memory::format::x}, - cpu_engine_}, - DummyData)); + context_.src_mem.reset(new MEMORY_CONSTRUCTOR( + context_.fwd_pd.get()->PRIMITIVE_DESC_SRC, cpu_engine_, DummyData)); + context_.weight_mem.reset(new MEMORY_CONSTRUCTOR( + context_.fwd_pd.get()->PRIMITIVE_DESC_WEIGHTS, cpu_engine_, DummyData)); + context_.dst_mem.reset(new MEMORY_CONSTRUCTOR( + context_.fwd_pd.get()->PRIMITIVE_DESC_DST, cpu_engine_, DummyData)); + context_.bias_mem.reset(new MEMORY_CONSTRUCTOR_USING_MEM_PD( + matmul_fwd_params.bias_dims, Tbias, MEMORY_FORMAT::x, cpu_engine_, + DummyData)); - // Create inner-product primitive +#ifdef ENABLE_MKLDNN_V1 + // Create inner-product primitive. + context_.matmul_fwd.reset(new inner_product_forward(*context_.fwd_pd)); + context_.net_args.push_back({{MKLDNN_ARG_SRC, *context_.src_mem}, + {MKLDNN_ARG_WEIGHTS, *context_.weight_mem}, + {MKLDNN_ARG_BIAS, *context_.bias_mem}, + { MKLDNN_ARG_DST, + *context_.dst_mem }}); +#else context_.matmul_fwd.reset(new inner_product_forward( *context_.fwd_pd, *context_.src_mem, *context_.weight_mem, *context_.bias_mem, *context_.dst_mem)); +#endif context_.fwd_primitives.push_back(*context_.matmul_fwd); return; @@ -355,9 +395,9 @@ class MklDnnMatMulOpBase : public OpKernel { OpKernelContext* context, const inner_product_forward::primitive_desc& mkldnn_matmul_prim_desc, const memory::dims& output_dims_mkl_order, - memory::format output_tf_format, Tensor** output_tensor) { + MKL_TENSOR_FORMAT output_tf_format, Tensor** output_tensor) { DCHECK(output_tensor); - auto dst_pd = mkldnn_matmul_prim_desc.dst_primitive_desc(); + auto dst_pd = mkldnn_matmul_prim_desc.PRIMITIVE_DESC_DST; MklDnnShape output_mkl_shape; output_mkl_shape.SetMklTensor(true); @@ -374,7 +414,7 @@ class MklDnnMatMulOpBase : public OpKernel { output_tf_shape, output_mkl_shape); } - engine cpu_engine_ = engine(engine::cpu, 0); + engine cpu_engine_ = engine(ENGINE_CPU, 0); protected: const int kInputIndexSrc = 0; @@ -383,6 +423,126 @@ class MklDnnMatMulOpBase : public OpKernel { const int kOutputIndexDst = 0; }; +#ifdef ENABLE_MKLDNN_V1_2 +// MatMul support for bfloat16 and int8 types is introduced in DNNLv1.2. +// We will enable this macro when we port our changes to DNNLv1.2. +namespace { + +void dnnl_gemm_exec(const dnnl::desc& a_md, const dnnl::desc& b_md, + const dnnl::desc& c_md, void* a, void* b, void* c, + const dnnl::primitive_attr& attr) { + // Create a MatMul primitive + dnnl::engine cpu_engine = mkldnn::engine(ENGINE_CPU, 0); + dnnl::matmul::desc matmul_desc(a_md, b_md, c_md); + dnnl::matmul::primitive_desc matmul_pd(matmul_desc, attr, cpu_engine); + dnnl::matmul matmul_prim(matmul_pd); + // Wrap raw pointers into DNNL memory objects + dnnl::memory a_memory(a_md, cpu_engine, a); + dnnl::memory b_memory(b_md, cpu_engine, b); + dnnl::memory c_memory(c_md, cpu_engine, c); + // Execute the MatMul primitive. 
+ // Since here all shapes and parameters are static, please note that we + // don't need to pass alpha (scales) again, as they are already hard-coded + // in the primitive descriptor. Also, we are not allowed to change the + // shapes of matrices A, B, and C -- they should exactly match + // the memory descriptors passed to MatMul operation descriptor. + dnnl::stream s(cpu_engine); + matmul_prim.execute(s, {{DNNL_ARG_SRC, a_memory}, + {DNNL_ARG_WEIGHTS, b_memory}, + {DNNL_ARG_DST, c_memory}}); + s.wait(); +} + +template +void dnnl_gemm_batch(const std::vector& transa, + const std::vector& transb, + const std::vector& m, + const std::vector& n, + const std::vector& k, + const std::vector& alpha, const T** a, + const std::vector lda, const T** b, + const std::vector& ldb, const float* beta, T** c, + const std::vector& ldc, const int64_t group_count, + const std::vector& group_size) { + // Current BatchMatMul support in Tensorflow is narrower than the one offered + // by MKL and MKL-DNN. Current BatchMatMul support in Tensorflow uses only 1 + // group of size equal to batch_size, and all MatMul parameters (m, n, k, + // lda, ldb, ldc, alpha, beta) within that group are same. + DCHECK(group_size.size() == 1); + DCHECK(transa.size() == group_size[0]); + DCHECK(transb.size() == group_size[0]); + DCHECK(alpha.size() == group_size[0]); + DCHECK(beta.size() == group_size[0]); + DCHECK(m.size() == group_size[0]); + DCHECK(n.size() == group_size[0]); + DCHECK(k.size() == group_size[0]); + DCHECK(lda.size() == group_size[0]); + DCHECK(ldb.size() == group_size[0]); + DCHECK(ldc.size() == group_size[0]); + for (int64_t idx = 0; idx < group_size[0]; idx++) + DCHECK(transa[0] == transa[idx]); + for (int64_t idx = 0; idx < group_size[0]; idx++) + DCHECK(transb[0] == transb[idx]); + for (int64_t idx = 0; idx < group_size[0]; idx++) + DCHECK(alpha[0] == alpha[idx]); + for (int64_t idx = 0; idx < group_size[0]; idx++) + DCHECK(beta[0] == beta[idx]); + for (int64_t idx = 0; idx < group_size[0]; idx++) DCHECK(m[0] == m[idx]); + for (int64_t idx = 0; idx < group_size[0]; idx++) DCHECK(n[0] == n[idx]); + for (int64_t idx = 0; idx < group_size[0]; idx++) DCHECK(k[0] == k[idx]); + for (int64_t idx = 0; idx < group_size[0]; idx++) DCHECK(lda[0] == lda[idx]); + for (int64_t idx = 0; idx < group_size[0]; idx++) DCHECK(ldb[0] == ldb[idx]); + for (int64_t idx = 0; idx < group_size[0]; idx++) DCHECK(ldc[0] == ldc[idx]); + + using dims = dnnl::memory::dims; + // Prepare strides based on the transa and transb flags: transposed + // matrices have strides swapped BatchMatMul in MKL-DNN supports 3D metrices + // so far. That is why strides are 3D also. + dims a_strides = transa[0] ? dims{lda[0], 1, 1} : dims{1, 1, lda[0]}; + dims b_strides = transb[0] ? 
dims{ldb[0], 1, 1} : dims{1, 1, ldb[0]}; + dims c_strides = dims{ldc[0], 1, 1}; + // Prepare memory descriptors + dnnl::desc a_md({group_size[0], m[0], k[0]}, MklDnnType(), a_strides); + dnnl::desc b_md({group_size[0], k[0], n[0]}, MklDnnType(), b_strides); + dnnl::desc c_md({group_size[0], m[0], n[0]}, MklDnnType(), c_strides); + // Create attributes (to handle alpha and beta if necessary) + dnnl::primitive_attr attr; + if (alpha[0] != 1.f) attr.set_output_scales(/* mask */ 0, {alpha[0]}); + if (beta[0] != 0.f) { + mkldnn::post_ops po; + po.append_sum(beta[0]); + attr.set_post_ops(po); + } + dnnl_gemm_exec(a_md, b_md, c_md, static_cast(a), static_cast(b), + static_cast(c), attr); +} + +template +void dnnl_gemm(char transa, char transb, int64_t m, int64_t n, int64_t k, + float alpha, const T* a, int64_t lda, const T* b, int64_t ldb, + float beta, T* c, int64_t ldc) { + using dims = dnnl::memory::dims; + // Prepare strides based on the transa and transb flags: transposed + // matrices have strides swapped + dims a_strides = tolower(transa) == 'n' ? dims{lda, 1} : dims{1, lda}; + dims b_strides = tolower(transb) == 'n' ? dims{ldb, 1} : dims{1, ldb}; + // Prepare memory descriptors + dnnl::desc a_md({m, k}, MklDnnType(), a_strides); + dnnl::desc b_md({k, n}, MklDnnType(), b_strides); + dnnl::desc c_md({m, n}, MklDnnType(), {ldc, 1}); + // Create attributes (to handle alpha and beta if necessary) + dnnl::primitive_attr attr; + if (alpha != 1.f) attr.set_output_scales(/* mask */ 0, {alpha}); + if (beta != 0.f) { + mkldnn::post_ops po; + po.append_sum(beta); + attr.set_post_ops(po); + } + dnnl_gemm_exec(a_md, b_md, c_md, static_cast(a), static_cast(b), + static_cast(c), attr); +} +#endif // ENABLE_MKLDNN_V1 + } // namespace tensorflow #endif // INTEL_MKL diff --git a/tensorflow/core/kernels/mkl_qmatmul_op.cc b/tensorflow/core/kernels/mkl_qmatmul_op.cc index f9f199547ed..311eeeb5221 100644 --- a/tensorflow/core/kernels/mkl_qmatmul_op.cc +++ b/tensorflow/core/kernels/mkl_qmatmul_op.cc @@ -196,7 +196,8 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase { // Describe how the inputs and outputs of inner-product look like. Also // specify buffers containing actual input and output data. Tensor* dst_tensor = nullptr; - auto input_output_fmt = memory::format::nc; + auto input_output_fmt = MEMORY_FORMAT::nc; + auto input_output_fmt_mkldnn = MKL_TENSOR_FORMAT_NC; // If input is in MKL layout, then simply take input layout; otherwise, // construct input TF layout. For TF layout, although input shape @@ -213,7 +214,7 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase { auto weight_md = weight_mkl_shape.IsMklTensor() ? weight_mkl_shape.GetMklLayout() : memory::desc(weight_dims, MklDnnType(), - memory::format::io); + MEMORY_FORMAT::io); weight.SetUsrMem(weight_md, &weight_tensor); MklDnnMatMulFwdPrimitive* @@ -235,16 +236,21 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase { std::shared_ptr matmul_fwd_pd = matmul_fwd->GetPrimitiveDesc(); this->AllocateOutputTensor(context, *matmul_fwd_pd, dst_dims_mkl_order, - input_output_fmt, &dst_tensor); + input_output_fmt_mkldnn, &dst_tensor); Toutput* dst_data = reinterpret_cast(dst_tensor->flat().data()); // Check if src and weight data need to be reordered. 
Tinput* src_data = nullptr; +#ifdef ENABLE_MKLDNN_V1 + if (IS_SRC_REORDER_NEEDED(src_md, matmul_fwd_pd, matmul_fwd)) { +#else if (src_md.data.format != matmul_fwd->GetSrcMemoryFormat()) { +#endif src.SetUsrMem(src_md, &src_tensor); - src.CheckReorderToOpMem(matmul_fwd_pd.get()->src_primitive_desc()); + src.CheckReorderToOpMem(MEMORY_PD_WITHOUT_DATA( + matmul_fwd_pd.get()->PRIMITIVE_DESC_SRC, this->cpu_engine_)); src_data = static_cast(src.GetOpMem().get_data_handle()); } else { src_data = static_cast( @@ -252,7 +258,11 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase { } Tweight* weight_data = nullptr; +#ifdef ENABLE_MKLDNN_V1 + if (IS_WEIGHTS_REORDER_NEEDED(weight_md, matmul_fwd_pd, matmul_fwd)) { +#else if (weight_md.data.format != matmul_fwd->GetweightMemoryFormat()) { +#endif bool is_weight_cached = false; // For batch size 1, MKL-DNN expects that weight format is OI whereas // TF default format is IO. So in that case convert weight from IO @@ -263,17 +273,22 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase { if (IsWeightCacheEmpty(context)) { // Cache weight if it is not cached. CacheWeight(context, matmul_fwd_pd, weight_data, weight_tensor, - weight, weight_md); + weight, weight_md, weight_mkl_shape); } - weight_data = - GetCachedWeight(context, matmul_fwd->GetweightMemoryFormat()); +#ifdef ENABLE_MKLDNN_V1 + weight_data = GetCachedWeight( + context, static_cast(weight_mkl_shape.GetTfDataFormat())); +#else + weight_data = GetCachedWeight( + context, static_cast(matmul_fwd->GetweightMemoryFormat())); +#endif is_weight_cached = (weight_data != nullptr); } if (!is_weight_cached) { weight.SetUsrMem(weight_md, &weight_tensor); - weight.CheckReorderToOpMem( - matmul_fwd_pd.get()->weights_primitive_desc()); + weight.CheckReorderToOpMem(MEMORY_PD_WITHOUT_DATA( + matmul_fwd_pd.get()->PRIMITIVE_DESC_WEIGHTS, this->cpu_engine_)); weight_data = static_cast(weight.GetOpMem().get_data_handle()); } @@ -432,19 +447,35 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase { std::vector scales; scales.push_back(out_scale); mkldnn::primitive_attr bias_attr; + stream reorder_stream = CPU_STREAM(this->cpu_engine_); bias_attr.set_output_scales(0, scales); void* bias_buf = static_cast( const_cast(bias_tensor.flat().data())); input_bias_ = - new memory(mkldnn_matmul_fwd_pd->bias_primitive_desc(), bias_buf); - scaled_bias_ = new memory(mkldnn_matmul_fwd_pd->bias_primitive_desc()); + new MEMORY_CONSTRUCTOR(mkldnn_matmul_fwd_pd->PRIMITIVE_DESC_BIAS, + this->cpu_engine_, bias_buf); + scaled_bias_ = new MEMORY_CONSTRUCTOR_WITHOUT_DATA( + mkldnn_matmul_fwd_pd->PRIMITIVE_DESC_BIAS, this->cpu_engine_); + +#ifdef ENABLE_MKLDNN_V1 + auto reorder_desc = mkldnn::reorder::primitive_desc( + *input_bias_, *scaled_bias_, bias_attr); + net.push_back(mkldnn::reorder(reorder_desc)); + std::unordered_map reorder_net_args = { + {MKLDNN_ARG_FROM, *input_bias_}, + { MKLDNN_ARG_TO, + *scaled_bias_ }}; + net.at(0).execute(reorder_stream, reorder_net_args); +#else auto reorder_desc = mkldnn::reorder::primitive_desc( input_bias_->get_primitive_desc(), scaled_bias_->get_primitive_desc(), bias_attr); net.push_back( mkldnn::reorder(reorder_desc, *input_bias_, *scaled_bias_)); - stream(stream::kind::eager).submit(net).wait(); + reorder_stream.submit(net).wait(); +#endif + return reinterpret_cast(scaled_bias_->get_data_handle()); } else { context->CtxFailure( @@ -483,8 +514,8 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase { const std::shared_ptr& matmul_fwd_pd, Tweight* weight_data, const 
Tensor& weight_tensor, - MklDnnData& weight, const memory::desc& weight_md) - LOCKS_EXCLUDED(mu_) { + MklDnnData& weight, const memory::desc& weight_md, + const MklDnnShape& weight_mkl_shape) LOCKS_EXCLUDED(mu_) { mutex_lock lock(mu_); const Tensor& weight_t = *weight_oi.AccessTensor(context); @@ -495,14 +526,15 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase { // Reorder and cache the weight weight.SetUsrMem(weight_md, &weight_tensor); - weight.CheckReorderToOpMem(matmul_fwd_pd.get()->weights_primitive_desc()); + weight.CheckReorderToOpMem(MEMORY_PD_WITHOUT_DATA( + matmul_fwd_pd.get()->PRIMITIVE_DESC_WEIGHTS, this->cpu_engine_)); weight_data = static_cast(weight.GetOpMem().get_data_handle()); Tensor* weight_tensor_ptr = nullptr; TensorShape weight_tf_shape; weight_tf_shape.AddDim( - (matmul_fwd_pd.get()->weights_primitive_desc().get_size() / + (GET_WEIGHTS_DESC_FROM_OP_PD(matmul_fwd_pd).get_size() / sizeof(Tweight))); OP_REQUIRES_OK(context, context->allocate_persistent( @@ -510,7 +542,7 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase { &weight_oi, &weight_tensor_ptr)); void* weight_oi_t_data = weight.GetTensorBuffer(weight_tensor_ptr); - size_t weight_size = weight.GetOpMem().get_primitive_desc().get_size(); + size_t weight_size = GET_WEIGHTS_DESC_FROM_OP_PD(matmul_fwd_pd).get_size(); memcpy(weight_oi_t_data, weight_data, weight_size); // Cache the memory descriptor @@ -522,12 +554,17 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase { OP_REQUIRES_OK(context, context->allocate_persistent( DT_INT32, weight_mkl_format, &weight_oi_md, &weight_md_tensor_ptr)); +#ifdef ENABLE_MKLDNN_V1 + // Using the logic from filter caching in mkl_conv_ops.cc + weight_md_tensor_ptr->scalar()() = + static_cast(weight_mkl_shape.GetTfDataFormat()); +#else weight_md_tensor_ptr->scalar()() = matmul_fwd_pd.get()->weights_primitive_desc().desc().data.format; +#endif // ENABLE_MKLDNN_V1 } - Tweight* GetCachedWeight(OpKernelContext* context, - const memory::format& weight_mf) + Tweight* GetCachedWeight(OpKernelContext* context, int32 weight_mf) LOCKS_EXCLUDED(mu_) { tf_shared_lock lock(mu_); const Tensor& weight_t = *weight_oi.AccessTensor(context); From fd479417d517603823279fcbf724bf8be4694128 Mon Sep 17 00:00:00 2001 From: tigertang Date: Fri, 14 Feb 2020 13:31:09 +0800 Subject: [PATCH 077/442] Fix a typo in imagenet run_eval readme --- .../evaluation/tasks/imagenet_image_classification/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/README.md b/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/README.md index ef8142e3d5d..bab96be53cc 100644 --- a/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/README.md +++ b/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/README.md @@ -151,7 +151,7 @@ bazel build -c opt \ directory if required): ``` -adb push bazel-bin/third_party/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval /data/local/tmp +adb push bazel-bin/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval /data/local/tmp ``` (3) Make the binary executable. 
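Earlier in this patch, the quantized MatMul kernel (mkl_qmatmul_op.cc) rescales its bias by attaching output scales to a reorder and executing it with MKLDNN_ARG_FROM/MKLDNN_ARG_TO arguments. A compact sketch of that pattern under the MKL-DNN v1.x API, with illustrative buffer and function names (the kernel derives its scale from the input and weight ranges rather than taking it as a parameter):

    // Rescale a bias buffer while reordering it, by attaching output scales
    // to the reorder primitive (MKL-DNN v1.x API).
    void ScaledBiasReorderSketch(const mkldnn::memory::desc& bias_md,
                                 void* bias_in, void* bias_out, float out_scale,
                                 const mkldnn::engine& eng, mkldnn::stream& strm) {
      mkldnn::primitive_attr attr;
      attr.set_output_scales(/*mask=*/0, {out_scale});
      mkldnn::memory src(bias_md, eng, bias_in);
      mkldnn::memory dst(bias_md, eng, bias_out);
      auto reorder_pd = mkldnn::reorder::primitive_desc(src, dst, attr);
      mkldnn::reorder(reorder_pd)
          .execute(strm, {{MKLDNN_ARG_FROM, src}, {MKLDNN_ARG_TO, dst}});
      strm.wait();
    }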
From 60da3fbda7e6a0c0a84b6bac168c3b06ced04d01 Mon Sep 17 00:00:00 2001 From: "Li, Guizi" Date: Fri, 14 Feb 2020 13:47:29 +0800 Subject: [PATCH 078/442] [Intel MKL] Fix dequantize accuracy issue and re-enable this OP --- tensorflow/core/graph/mkl_layout_pass.cc | 32 ++++---- tensorflow/core/kernels/BUILD | 2 + tensorflow/core/kernels/mkl_dequantize_op.cc | 16 ++-- .../core/kernels/mkl_dequantize_op_test.cc | 81 +++++++++++++++++++ tensorflow/core/kernels/mkl_reshape_op.cc | 81 ++++++------------- tensorflow/core/ops/mkl_array_ops.cc | 3 + tensorflow/core/util/mkl_util.h | 15 ++-- 7 files changed, 146 insertions(+), 84 deletions(-) diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index 33b66848081..0b765e22d38 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -359,9 +359,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass { csinfo_.mul = "Mul"; csinfo_.squared_difference = "SquaredDifference"; csinfo_.sub = "Sub"; -// End - element-wise ops. See note above. + // End - element-wise ops. See note above. -// NOTE: names are alphabetically sorted. + // NOTE: names are alphabetically sorted. rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn), CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); @@ -671,18 +671,18 @@ class MklLayoutRewritePass : public GraphOptimizationPass { rinfo_.push_back( {csinfo_.requantize, mkl_op_registry::GetMklOpName(csinfo_.requantize), CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); -// Disable these two MKL operators for now due to some test failures caused -// by these two ops -/* -rinfo_.push_back({csinfo_.tanh, - mkl_op_registry::GetMklOpName(csinfo_.tanh), - CopyAttrsAll, AlwaysRewrite, - kRewriteForLayoutPropagation}); -rinfo_.push_back({csinfo_.tanh_grad, - mkl_op_registry::GetMklOpName(csinfo_.tanh_grad), - CopyAttrsAll, AlwaysRewrite, - kRewriteForLayoutPropagation}); -*/ + // Disable these two MKL operators for now due to some test failures caused + // by these two ops + /* + rinfo_.push_back({csinfo_.tanh, + mkl_op_registry::GetMklOpName(csinfo_.tanh), + CopyAttrsAll, AlwaysRewrite, + kRewriteForLayoutPropagation}); + rinfo_.push_back({csinfo_.tanh_grad, + mkl_op_registry::GetMklOpName(csinfo_.tanh_grad), + CopyAttrsAll, AlwaysRewrite, + kRewriteForLayoutPropagation}); + */ rinfo_.push_back( {csinfo_.reshape, mkl_op_registry::GetMklOpName(csinfo_.reshape), CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); @@ -1478,9 +1478,7 @@ rinfo_.push_back({csinfo_.tanh_grad, "Eigen op for Dequantize op."; return false; } - // TODO(sriniva2/mabuzain) Enable the op after verifying support for - // object detection models - return false; + return true; } // Rewrite rule for _FusedMatMul. 
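The mkl_layout_pass.cc hunk above re-enables the Dequantize rewrite by having its rewrite predicate return true again. Predicates in this pass are plain functions over the candidate node; a hypothetical variant that instead gates the rewrite on the node's `mode` attribute could look like the sketch below (illustrative only; the function name and the exact condition are assumptions, and the real checks live in mkl_layout_pass.cc):

    // Hypothetical rewrite predicate: rewrite Dequantize -> _MklDequantize only
    // when the quantization mode is one the MKL kernel handles.
    static bool RewriteDequantizeSketch(const Node* n) {
      string mode;
      if (!GetNodeAttr(n->def(), "mode", &mode).ok()) return false;
      return mode == "SCALED";
    }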
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 409f52db948..f72236e07a1 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -7976,6 +7976,7 @@ tf_cc_test_mkl( srcs = ["mkl_dequantize_op_test.cc"], deps = [ ":mkl_dequantize_op", + ":mkl_tfconv_op", ":ops_testutil", ":ops_util", "//tensorflow/core:array_ops_op_lib", @@ -7984,6 +7985,7 @@ tf_cc_test_mkl( "//tensorflow/core:mkl_array_ops_op_lib", "//tensorflow/core:nn_ops_op_lib", "//tensorflow/core:protos_all_cc", + "//tensorflow/core:tensorflow", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", diff --git a/tensorflow/core/kernels/mkl_dequantize_op.cc b/tensorflow/core/kernels/mkl_dequantize_op.cc index 4c9dbf4274a..02aaf9ee798 100644 --- a/tensorflow/core/kernels/mkl_dequantize_op.cc +++ b/tensorflow/core/kernels/mkl_dequantize_op.cc @@ -92,10 +92,12 @@ class MklDequantizeOp : public OpKernel { memory::primitive_desc src_pd = memory::primitive_desc(src_md, cpu_engine); - memory::desc dst_md = src_mkl_shape.IsMklTensor() - ? src_md - : memory::desc(src_dims, MklDnnType(), - memory::format::nhwc); + memory::desc dst_md = + src_mkl_shape.IsMklTensor() + ? memory::desc(src_dims, MklDnnType(), + static_cast(src_md.data.format)) + : memory::desc(src_dims, MklDnnType(), + memory::format::nhwc); memory::primitive_desc dst_pd = memory::primitive_desc(dst_md, cpu_engine); @@ -150,9 +152,9 @@ class MklDequantizeOp : public OpKernel { mkldnn::reorder(reorder_pd, *src.GetUsrMem(), *dst.GetUsrMem())); stream(stream::kind::eager).submit(net).wait(); } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + ", message: " + + string(e.message) + ", in file " + string(__FILE__) + + ":" + std::to_string(__LINE__); OP_REQUIRES_OK( ctx, errors::Aborted("Operation received an exception:", error_msg)); } diff --git a/tensorflow/core/kernels/mkl_dequantize_op_test.cc b/tensorflow/core/kernels/mkl_dequantize_op_test.cc index 23d59ef7ab6..3093b87fb95 100644 --- a/tensorflow/core/kernels/mkl_dequantize_op_test.cc +++ b/tensorflow/core/kernels/mkl_dequantize_op_test.cc @@ -22,6 +22,8 @@ limitations under the License. 
#include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/util/mkl_util.h" + namespace tensorflow { class MklDequantizeOpTest : public OpsTestBase {}; @@ -59,4 +61,83 @@ TEST_F(MklDequantizeOpTest, small) { test::ExpectTensorNear(expected, output, 0.1); } +Tensor CreateMklInput() { + MklDnnShape mkl_shape; + memory::desc md = + memory::desc({1, 2, 2, 2}, MklDnnType(), memory::format::nhwc); + mkl_shape.SetMklTensor(true); + mkl_shape.SetMklLayout(&md); + mkl_shape.SetElemType(MklDnnType()); + mkl_shape.SetTfLayout(4, {1, 2, 2, 2}, memory::format::nhwc); + + DataType dtype = DataTypeToEnum::v(); + Tensor mkl_tensor(dtype, {mkl_shape.GetSerializeBufferSize()}); + mkl_shape.SerializeMklDnnShape( + mkl_tensor.flat().data(), + mkl_tensor.flat().size() * sizeof(uint8)); + return mkl_tensor; +} + +template +class CommonTestUtilities : public OpsTestBase { + public: + void MklToTF(const Tensor& tensor, const Tensor& mkl_meta_tensor, + Tensor* output) { + // Create an MKL to TF conversion node and execute it + TF_ASSERT_OK(NodeDefBuilder("mkl_to_tf_op", "_MklToTf") + .Input(FakeInput(DataTypeToEnum::v())) + .Input(FakeInput(DT_UINT8)) // MKL second tensor + .Attr("T", DataTypeToEnum::v()) + .Attr("_kernel", "MklLayoutDependentOp") + .Finalize(node_def())); + TF_ASSERT_OK(InitOp()); + AddInputFromArray(tensor.shape(), tensor.flat()); + AddInputFromArray(mkl_meta_tensor.shape(), + mkl_meta_tensor.flat()); + TF_ASSERT_OK(RunOpKernel()); + + *output = *GetOutput(0); + } + + void ConvertAndCompare(const Tensor& tensor, const Tensor& mkl_meta_tensor, + const Tensor& expected) { + Tensor output; + MklToTF(tensor, mkl_meta_tensor, &output); + test::ExpectTensorNear(expected, output, 0.1); + } + + void TestBody() {} +}; + +TEST_F(MklDequantizeOpTest, MKLInput) { + TF_ASSERT_OK(NodeDefBuilder("dequantize_op", "_MklDequantize") + .Input(FakeInput(DT_QUINT8)) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_UINT8)) // MKL second tensor + .Input(FakeInput(DT_UINT8)) // MKL second tensor + .Input(FakeInput(DT_UINT8)) // MKL second tensor + .Attr("T", DataTypeToEnum::v()) + .Attr("mode", "SCALED") + .Attr("_kernel", "QuantizedMklOp") + .Finalize(node_def())); + TF_ASSERT_OK(InitOp()); + AddInputFromArray(TensorShape({1, 2, 2, 2}), + {0, 10, 50, 40, 25, 115, 190, 255}); + // min_range = 0 + AddInputFromArray(TensorShape({1}), {0}); + // max_range = 200 + AddInputFromArray(TensorShape({1}), {200.0f}); + auto mkl_tensor = CreateMklInput(); + AddInputFromArray(mkl_tensor.shape(), mkl_tensor.flat()); + AddInputFromArray(dummy_shape, dummy_tensor); + AddInputFromArray(dummy_shape, dummy_tensor); + TF_ASSERT_OK(RunOpKernel()); + Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 2, 2, 2})); + test::FillValues(&expected, + {0.0, 7.84, 39.21, 31.37, 19.6, 90.2, 149.0, 200}); + CommonTestUtilities test_util; + test_util.ConvertAndCompare(*GetOutput(0), *GetOutput(1), expected); +} + } // namespace tensorflow diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc index 3c95a37ecfd..ddb2548b99b 100644 --- a/tensorflow/core/kernels/mkl_reshape_op.cc +++ b/tensorflow/core/kernels/mkl_reshape_op.cc @@ -132,7 +132,7 @@ class MklReshapeOp : public OpKernel { " values, but the requested shape has ", shape.num_elements())); - if (input_in_mkl_format) { + if (input_in_mkl_format && !SkipReorder(mkl_shape_input, shape)) { TensorShape& shape_to = shape; TensorShape shape_from = 
mkl_shape_input.GetTfShape(); if (shape_from == shape_to) { @@ -152,65 +152,36 @@ class MklReshapeOp : public OpKernel { // Tensorflow, we don't need to reorder tensor contents, we just // need to update MklDnnShape object associated with the input // tensor to reflect the shape change expected by reshape. - if (!SkipReorder(mkl_shape_input, shape_to)) { - // If dimensions that are being expanded or collapsed are not - // maintained contiguously by MKLDNN, then we use reorder. + // If dimensions that are being expanded or collapsed are not + // maintained contiguously by MKLDNN, then we use reorder. - // Get Mkl layout of input tensor. - auto input_mkl_md = mkl_shape_input.GetMklLayout(); - // Set input Mkl layout as the user layout. - dnn_data_input.SetUsrMem(input_mkl_md, &input_tensor); - // Get expected Tensorflow layout of input tensor. - auto output_tf_md = mkl_shape_input.GetTfLayout(); - auto output_tf_pd = - memory::primitive_desc(output_tf_md, cpu_engine); + // Get Mkl layout of input tensor. + auto input_mkl_md = mkl_shape_input.GetMklLayout(); + // Set input Mkl layout as the user layout. + dnn_data_input.SetUsrMem(input_mkl_md, &input_tensor); + // Get expected Tensorflow layout of input tensor. + auto output_tf_md = mkl_shape_input.GetTfLayout(); + auto output_tf_pd = memory::primitive_desc(output_tf_md, cpu_engine); - Tensor* output_tensor = nullptr; - MklDnnShape mkl_shape_output; - mkl_shape_output.SetMklTensor(false); - // We allocate output tensor in the shape expected by Reshape. - AllocateOutputSetMklShape(context, kOutputSlotIdx, &output_tensor, - shape_to, mkl_shape_output); + Tensor* output_tensor = nullptr; + MklDnnShape mkl_shape_output; + mkl_shape_output.SetMklTensor(false); + // We allocate output tensor in the shape expected by Reshape. + AllocateOutputSetMklShape(context, kOutputSlotIdx, &output_tensor, + shape_to, mkl_shape_output); - // Insert reorder between Mkl layout and TensorFlow layout if - // needed. If reorder is not needed but reshape is needed (since - // shape_from != shape_to), then we just copy input tensor to - // output tensor with target shape (we cannot forward Mkl layout - // in such case because shape has changed.) - if (dnn_data_input.CheckReorderToOpMem(output_tf_pd, - output_tensor)) { - } else { - OP_REQUIRES( - context, output_tensor->CopyFrom(input_tensor, shape_to), - errors::InvalidArgument("invalid input tensor shape")); - } - return; + // Insert reorder between Mkl layout and TensorFlow layout if + // needed. If reorder is not needed but reshape is needed (since + // shape_from != shape_to), then we just copy input tensor to + // output tensor with target shape (we cannot forward Mkl layout + // in such case because shape has changed.) + if (dnn_data_input.CheckReorderToOpMem(output_tf_pd, output_tensor)) { } else { - // If dimensions that are being expanded or collapsed are - // maintained contiguously by MKLDNN, then we skip reorder, just - // update MklDnnShape object for the tensorflow tensor, and forward - // Tensorflow tensor as it is to the output. 
- auto output_dims = TFShapeToMklDnnDims(shape_to); - auto output_strides = CalculateTFStrides(output_dims); - auto output_tf_md = MklDnnData::CreateBlockedMemDesc( - output_dims, output_strides); - auto output_tf_pd = - memory::primitive_desc(output_tf_md, cpu_engine); - - // Set MklDnnShape - MklDnnShape mkl_shape_output; - mkl_shape_output.SetMklTensor(true); - mkl_shape_output.SetMklLayout(&output_tf_pd); - mkl_shape_output.SetElemType(MklDnnType()); - mkl_shape_output.SetTfLayout(output_dims.size(), output_dims, - memory::format::blocked); - - // We now simply forward input Mkl tensor to output and change its - // output MklDnnShape object. - ForwardMklTensorInToOutWithMklShape( - context, kInputSlotIdx, kOutputSlotIdx, mkl_shape_output); - return; + OP_REQUIRES(context, + output_tensor->CopyFrom(input_tensor, shape_to), + errors::InvalidArgument("invalid input tensor shape")); } + return; } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + ", message: " + string(e.message) + ", in file " + diff --git a/tensorflow/core/ops/mkl_array_ops.cc b/tensorflow/core/ops/mkl_array_ops.cc index d4908f881e9..4e58711ccad 100644 --- a/tensorflow/core/ops/mkl_array_ops.cc +++ b/tensorflow/core/ops/mkl_array_ops.cc @@ -142,7 +142,10 @@ REGISTER_OP("_MklDequantize") .Output("output: float") .Output("mkl_output: uint8") .Attr("T: quantizedtype") + .Attr("narrow_range: bool = false") + .Attr("axis: int = -1") .Attr("mode: {'MIN_COMBINED', 'MIN_FIRST', 'SCALED'} = 'SCALED'") + .Attr("dtype: {bfloat16, float} = DT_FLOAT") .SetShapeFn([](InferenceContext* c) { TF_RETURN_IF_ERROR(shape_inference::UnchangedShape(c)); ShapeHandle unused; diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index e4450ee8a56..34183e48a6d 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -728,9 +728,9 @@ inline Status ConvertMklToTF(OpKernelContext* context, } return Status::OK(); } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + ", message: " + + string(e.message) + ", in file " + string(__FILE__) + + ":" + std::to_string(__LINE__); LOG(FATAL) << "Operation received an exception: " << error_msg; } } @@ -1011,6 +1011,11 @@ memory::data_type MklDnnType() { return memory::data_type::u8; } +template <> +memory::data_type MklDnnType() { + return memory::data_type::u8; +} + template <> memory::data_type MklDnnType() { return memory::data_type::s8; @@ -1250,8 +1255,8 @@ inline Status CreateBlockedMemDescHelper(const memory::dims& dim, } catch (mkldnn::error& e) { return Status(error::Code::INTERNAL, tensorflow::strings::StrCat( - "Failed to create blocked memory descriptor.", - "Status: ", e.status, ", message: ", e.message)); + "Failed to create blocked memory descriptor.", "Status: ", + e.status, ", message: ", e.message)); } #else // We have to construct memory descriptor in a C style. This is not at all From 19ecdb017ac37c5fb62d30ec1f8ad28b341228d5 Mon Sep 17 00:00:00 2001 From: Judd Date: Fri, 14 Feb 2020 14:33:34 +0800 Subject: [PATCH 079/442] Update accuracy_utils.py fix v2 compatibility. 
--- tensorflow/examples/speech_commands/accuracy_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/examples/speech_commands/accuracy_utils.py b/tensorflow/examples/speech_commands/accuracy_utils.py index dd5a12c2087..11814d70cd8 100755 --- a/tensorflow/examples/speech_commands/accuracy_utils.py +++ b/tensorflow/examples/speech_commands/accuracy_utils.py @@ -137,14 +137,14 @@ class StreamingAccuracyStats(object): def print_accuracy_stats(self): """Write a human-readable description of the statistics to stdout.""" if self._how_many_gt == 0: - tf.logging.info('No ground truth yet, {}false positives'.format( + tf.compat.v1.logging.info('No ground truth yet, {}false positives'.format( self._how_many_fp)) else: any_match_percentage = self._how_many_gt_matched / self._how_many_gt * 100 correct_match_percentage = self._how_many_c / self._how_many_gt * 100 wrong_match_percentage = self._how_many_w / self._how_many_gt * 100 false_positive_percentage = self._how_many_fp / self._how_many_gt * 100 - tf.logging.info('{:.1f}% matched, {:.1f}% correct, {:.1f}% wrong, ' + tf.compat.v1.logging.info('{:.1f}% matched, {:.1f}% correct, {:.1f}% wrong, ' '{:.1f}% false positive'.format( any_match_percentage, correct_match_percentage, wrong_match_percentage, false_positive_percentage)) From 8e3fc979820943f049e744b548161da995cd7eea Mon Sep 17 00:00:00 2001 From: Judd Date: Fri, 14 Feb 2020 14:36:43 +0800 Subject: [PATCH 080/442] Update test_streaming_accuracy.py fix v2 compatibility. --- .../test_streaming_accuracy.py | 35 +++++++++---------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/tensorflow/examples/speech_commands/test_streaming_accuracy.py b/tensorflow/examples/speech_commands/test_streaming_accuracy.py index 4b7fa717348..d4bf43b552b 100755 --- a/tensorflow/examples/speech_commands/test_streaming_accuracy.py +++ b/tensorflow/examples/speech_commands/test_streaming_accuracy.py @@ -69,10 +69,9 @@ import sys import numpy import tensorflow as tf -from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio -from tensorflow.examples.speech_commands.accuracy_utils import StreamingAccuracyStats -from tensorflow.examples.speech_commands.recognize_commands import RecognizeCommands -from tensorflow.examples.speech_commands.recognize_commands import RecognizeResult +from accuracy_utils import StreamingAccuracyStats +from recognize_commands import RecognizeCommands +from recognize_commands import RecognizeResult from tensorflow.python.ops import io_ops FLAGS = None @@ -82,8 +81,8 @@ def load_graph(mode_file): """Read a tensorflow model, and creates a default graph object.""" graph = tf.Graph() with graph.as_default(): - od_graph_def = tf.GraphDef() - with tf.gfile.GFile(mode_file, 'rb') as fid: + od_graph_def = tf.compat.v1.GraphDef() + with tf.io.gfile.GFile(mode_file, 'rb') as fid: serialized_graph = fid.read() od_graph_def.ParseFromString(serialized_graph) tf.import_graph_def(od_graph_def, name='') @@ -101,10 +100,10 @@ def read_label_file(file_name): def read_wav_file(filename): """Load a wav file and return sample_rate and numpy data of float64 type.""" - with tf.Session(graph=tf.Graph()) as sess: - wav_filename_placeholder = tf.placeholder(tf.string, []) + with tf.compat.v1.Session(graph=tf.Graph()) as sess: + wav_filename_placeholder = tf.compat.v1.placeholder(tf.string, []) wav_loader = io_ops.read_file(wav_filename_placeholder) - wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1) + wav_decoder = 
tf.audio.decode_wav(wav_loader, desired_channels=1) res = sess.run(wav_decoder, feed_dict={wav_filename_placeholder: filename}) return res.sample_rate, res.audio.flatten() @@ -133,14 +132,14 @@ def main(_): # Load model and create a tf session to process audio pieces recognize_graph = load_graph(FLAGS.model) with recognize_graph.as_default(): - with tf.Session() as sess: + with tf.compat.v1.Session() as sess: # Get input and output tensor - data_tensor = tf.get_default_graph().get_tensor_by_name( + data_tensor = sess.graph.get_tensor_by_name( FLAGS.input_names[0]) - sample_rate_tensor = tf.get_default_graph().get_tensor_by_name( + sample_rate_tensor = sess.graph.get_tensor_by_name( FLAGS.input_names[1]) - output_softmax_tensor = tf.get_default_graph().get_tensor_by_name( + output_softmax_tensor = sess.graph.get_tensor_by_name( FLAGS.output_name) # Inference along audio stream. @@ -161,7 +160,7 @@ def main(_): recognize_commands.process_latest_result(outputs, current_time_ms, recognize_element) except ValueError as e: - tf.logging.error('Recognition processing failed: {}' % e) + tf.compat.v1.logging.error('Recognition processing failed: {}' % e) return if (recognize_element.is_new_command and recognize_element.founded_command != '_silence_'): @@ -173,10 +172,10 @@ def main(_): try: recognition_state = stats.delta() except ValueError as e: - tf.logging.error( + tf.compat.v1.logging.error( 'Statistics delta computing failed: {}'.format(e)) else: - tf.logging.info('{}ms {}:{}{}'.format( + tf.compat.v1.logging.info('{}ms {}:{}{}'.format( current_time_ms, recognize_element.founded_command, recognize_element.score, recognition_state)) stats.print_accuracy_stats() @@ -249,5 +248,5 @@ if __name__ == '__main__': help='Whether to print streaming accuracy on stdout.') FLAGS, unparsed = parser.parse_known_args() - tf.logging.set_verbosity(tf.logging.INFO) - tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) + tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) + tf.compat.v1.app.run(main=main, argv=[sys.argv[0]] + unparsed) From e390647355548744b278f8f8dfd86eef4094b8e4 Mon Sep 17 00:00:00 2001 From: Judd Date: Fri, 14 Feb 2020 16:06:02 +0800 Subject: [PATCH 081/442] Update accuracy_utils.py fix pylint warning/errors. 
--- tensorflow/examples/speech_commands/accuracy_utils.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tensorflow/examples/speech_commands/accuracy_utils.py b/tensorflow/examples/speech_commands/accuracy_utils.py index 11814d70cd8..a2d050ad31b 100755 --- a/tensorflow/examples/speech_commands/accuracy_utils.py +++ b/tensorflow/examples/speech_commands/accuracy_utils.py @@ -144,7 +144,8 @@ class StreamingAccuracyStats(object): correct_match_percentage = self._how_many_c / self._how_many_gt * 100 wrong_match_percentage = self._how_many_w / self._how_many_gt * 100 false_positive_percentage = self._how_many_fp / self._how_many_gt * 100 - tf.compat.v1.logging.info('{:.1f}% matched, {:.1f}% correct, {:.1f}% wrong, ' - '{:.1f}% false positive'.format( - any_match_percentage, correct_match_percentage, - wrong_match_percentage, false_positive_percentage)) + tf.compat.v1.logging.info( + '{:.1f}% matched, {:.1f}% correct, {:.1f}% wrong, ' + '{:.1f}% false positive'.format( + any_match_percentage, correct_match_percentage, + wrong_match_percentage, false_positive_percentage)) From f7c3540676beaef2125f1ea4b75ebf368930d082 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Fri, 14 Feb 2020 14:27:30 +0100 Subject: [PATCH 082/442] Fix comments --- .../tf2tensorrt/convert/convert_nodes.cc | 9 +++------ .../compiler/tf2tensorrt/convert/utils.h | 3 ++- .../tf2tensorrt/kernels/trt_engine_op.cc | 14 +++++++------- .../utils/trt_shape_optimization_profiles.cc | 2 +- .../utils/trt_shape_optimization_profiles.h | 18 +++++++++--------- .../trt_shape_optimization_profiles_test.cc | 2 +- 6 files changed, 23 insertions(+), 25 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index a76e833bafe..4f875b62435 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -252,17 +252,14 @@ void GetInputProperties(const grappler::GraphProperties& graph_properties, // This function checks if a tensor is compatible with TRT. // -// We check that the shape and datatype is compatible with TensorRT. We also +// We check that the shape and datatype are compatible with TensorRT. We also // return the corresponding trt_dtype, the trt_dims and the batch_size (latter // is only needed in implicit batch mode). // // The return status indicates wether the tensor is compatible. // -// If validation_only == false, then we make an additional check. In implicit -// batch mode we check that all inputs for the network has static shape (as -// required by the TensorRT). The only exception is the batch size, which -// could be unknown. In contrast, using explicit batch mode this test is not -// necessary, since any dimension could be unknown in explicit batch mode. +// For implicit batch mode, when validation_only == false, we also check that +// all input dimensions (besides the batch dimension) are known dimensions. 
Status ValidateTensorProperties(const string& producer_node_type, const DataType dtype, const PartialTensorShape& shape, diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.h b/tensorflow/compiler/tf2tensorrt/convert/utils.h index bda01108341..40e446b131e 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.h +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.h @@ -108,7 +108,8 @@ string GetLoadedTensorRTVersion(); // Returns the number of inputs for the engine, which also correspends to the // number of input tensors for the network. This can differ from the number of -// input bindings, because each profile has a set of bindings. +// input bindings, because the number of total input bindings equals the number +// of profiles times the number of engine inputs. int GetNumberOfEngineInputs(const nvinfer1::ICudaEngine *engine); #endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index ec2b423cd08..521b38341b0 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -93,7 +93,7 @@ class TRTEngineOp : public AsyncOpKernel { LRUCache, std::unique_ptr, VectorTensorShapeHasher>; - // Execute calibration. + // Executes calibration. void ExecuteCalibration(OpKernelContext* ctx, TRTEngineCacheResource* cache_res, AsyncHelper* helper); @@ -104,15 +104,15 @@ class TRTEngineOp : public AsyncOpKernel { Status ConstructFunctionHandle(FunctionLibraryRuntime* lib, const string& device_name); - // Execute replaced native segment as function Op. + // Executes replaced native segment as function Op. void ExecuteNativeSegment(OpKernelContext* ctx, AsyncHelper* helper); - // Execute the tensorrt engine. Returns whether we need to retry by running + // Executes the tensorrt engine. Returns whether we need to retry by running // the native segment. bool ExecuteTrtEngine(OpKernelContext* ctx, EngineContext* engine_context, int trt_context_idx); - // Allocate necessary resources for calibration. + // Allocates necessary resources for calibration. Status AllocateCalibrationResources(OpKernelContext* ctx, TRTEngineCacheResource* cache_res); @@ -598,9 +598,9 @@ void TRTEngineOp::ComputeAsync(OpKernelContext* ctx, if (!use_implicit_batch_) { if (cache_res->profiles_.GetNumProfiles() == 0) { - // Create a single profile from the current input shape. - // In the future we will collect a set of input shapes during build mode - // and create profiles for each of them. + // Create a single profile from the current input shape. In the future we + // will collect a set of input shapes during build mode and create + // profiles for each of them. cache_res->profiles_.AddShape(input_concrete_shapes); cache_res->profiles_.InitProfiles(); } diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc index 4e4ad0a3649..60ceac2077d 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc @@ -21,7 +21,7 @@ limitations under the License. namespace tensorflow { namespace tensorrt { -// Create optimization profiles for a list of input shapes. The list of input +// Creates optimization profiles for a list of input shapes. The list of input // shapes are stored in shapes_. 
void TrtShapeOptimizationProfile::InitProfiles() { if (input_shapes_.size() == 0) { diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h index 5685acea15f..281692c8b08 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h @@ -57,7 +57,7 @@ struct OptimizationProfileConfig { } #if IS_TRT_VERSION_GE(6, 0, 0, 0) - // Set the stored min/opt/max dimensions for profile. + // Sets the stored min/opt/max dimensions for profile. // // Parameters: // network - TensorRT network, used to enumerate all the input tensors @@ -81,15 +81,15 @@ struct OptimizationProfileConfig { // Returns true if profile range completely includes the given shapes. bool IncludesShapes(const std::vector& shapes) const { - // min, max, and opt must have the same size which, - // already verified in SetDimensions. + // min, max, and opt must have the same size which is already verified in + // SetDimensions. if (min.size() != shapes.size()) { return false; } for (int i = 0; i < shapes.size(); i++) { auto current_shape = shapes[i]; - // min, max, and opt must have the same nbDims, which is - // already verified in SetDimensions. + // min, max, and opt must have the same nbDims, which is already verified + // in SetDimensions. if (min[i].nbDims != current_shape.dims()) { return false; } @@ -144,14 +144,14 @@ class TrtShapeOptimizationProfile { nvinfer1::ICudaEngine* engine, std::vector>& exec_context); - /// Map input vector shapes to TRT Optimization profiles (min, max, opt) - // i.e. maps input_shapes_ to profiles_ + // Maps input vector shapes to TRT Optimization profiles (min, max, opt) i.e. + // maps input_shapes_ to profiles_ void InitProfiles(); // Returns number of created profiles. int GetNumProfiles() const; - // Restore profiles from the engine (used after deserialization) + // Restores profiles from the engine (used after deserialization) Status RestoreProfiles(const nvinfer1::ICudaEngine* engine); private: @@ -163,7 +163,7 @@ class TrtShapeOptimizationProfile { std::vector profiles_; #if IS_TRT_VERSION_GE(6, 0, 0, 0) - /// Add optimization profiles to the builder config + /// Adds optimization profiles to the builder config Status AddProfiles(nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, const nvinfer1::INetworkDefinition* network); diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc index 56a6c430279..8efd65cdce5 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc @@ -86,7 +86,7 @@ class TrtShapeOptimizationProfileTest : public ::testing::Test { #endif } - // Define a simple network: output = input1 + input2. + // Defines a simple network: output = input1 + input2. 
void DefineNetwork(nvinfer1::INetworkDefinition* network, nvinfer1::Dims3& dims) { nvinfer1::ITensor* input1 = From 5c3e81736c37942ea6684a4a424e97b2ba4208ab Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Fri, 14 Feb 2020 14:28:17 +0100 Subject: [PATCH 083/442] Disable explicit batch and dynamic shapes test for TRT5 --- .../tf2tensorrt/kernels/trt_engine_op_test.cc | 2 ++ .../python/compiler/tensorrt/test/trt_mode_test.py | 12 +++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc index fd067064aac..8dda2489592 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc @@ -186,6 +186,7 @@ TEST_F(TRTEngineOpTestBase, DynamicEngines) { EXPECT_EQ(1, cache->count({TensorShape({10, 10})})); } +#if IS_TRT_VERSION_GE(6, 0, 0, 0) TEST_F(TRTEngineOpTestBase, ExplicitBatch) { // Test inference in explicit batch mode with static input shapes. Static // shapes in this context means that the TensorRT knows all the input shapes @@ -262,6 +263,7 @@ TYPED_TEST(TRTEngineOpTest, Basic) { output->NumElements()), ElementsAre(TypeParam(0.0f), TypeParam(2.0f))); } +#endif } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/python/compiler/tensorrt/test/trt_mode_test.py b/tensorflow/python/compiler/tensorrt/test/trt_mode_test.py index c9ec88c2f52..415c16a114d 100644 --- a/tensorflow/python/compiler/tensorrt/test/trt_mode_test.py +++ b/tensorflow/python/compiler/tensorrt/test/trt_mode_test.py @@ -20,13 +20,13 @@ from __future__ import print_function from unittest import SkipTest # pylint: disable=g-importing-member +from tensorflow.compiler.tf2tensorrt.wrap_py_utils import get_linked_tensorrt_version from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import test - class TrtModeTestBase(trt_test.TfTrtIntegrationTestBase): """Test squeeze on batch dim and some unary operations in TF-TRT.""" @@ -122,6 +122,11 @@ class ExplicitBatchTest(TrtModeTestBase): """ return ["TRTEngineOp_0"] + def ShouldRunTest(self, run_params): + # Only run for TRT 6 and above. + ver = get_linked_tensorrt_version() + return ver[0] >= 6 + class DynamicShapesTest(TrtModeTestBase): """Test with dynamic input shapes. @@ -146,6 +151,11 @@ class DynamicShapesTest(TrtModeTestBase): """Return the expected engines to build.""" return ["TRTEngineOp_0"] + def ShouldRunTest(self, run_params): + # Only run for TRT 6 and above. + ver = get_linked_tensorrt_version() + return ver[0] >= 6 + if __name__ == "__main__": test.main() From bbef16d675efd91846374a86717f4b038ad81444 Mon Sep 17 00:00:00 2001 From: frreiss Date: Fri, 14 Feb 2020 11:39:03 -0800 Subject: [PATCH 084/442] Address review comments --- tensorflow/python/data/ops/dataset_ops.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py index c0137373be5..30f915322c6 100644 --- a/tensorflow/python/data/ops/dataset_ops.py +++ b/tensorflow/python/data/ops/dataset_ops.py @@ -1802,22 +1802,12 @@ name=None)) fewer if there are not enough input elements to fill the window and `drop_remainder` evaluates to `False`). 
- The `shift` argument determines the number of input elements by which - the window moves on each iteration. The first element in the `k`th window - will be element - - ``` - 1 + (k-1) * shift - ``` - + The `shift` argument determines the number of input elements by which the + window moves on each iteration. If windows and elements are both numbered + starting at 0, the first element in window `k` will be element `k * shift` of the input dataset. In particular, the first element of the first window will always be the first element of the input dataset. - If the `stride` parameter is greater than 1, then each window will skip - `(stride - 1)` input elements between each element that appears in the - window. Output windows will still contain `size` elements regardless of - the value of `stride`. - The `stride` argument determines the stride of the input elements, and the `shift` argument determines the shift of the window. From 16a10ea5f97ed2c7e0a7132380e355a35a4b9afc Mon Sep 17 00:00:00 2001 From: mdfaijul Date: Sun, 9 Feb 2020 22:41:19 -0800 Subject: [PATCH 085/442] Added support for MKLDNN 1.x for QuantizeOpV2 and DequantizeOp. --- tensorflow/core/kernels/mkl_dequantize_op.cc | 54 +++-- tensorflow/core/kernels/mkl_quantize_op.cc | 203 ++++++++----------- tensorflow/core/util/mkl_types.h | 3 + tensorflow/core/util/mkl_util.h | 8 +- 4 files changed, 122 insertions(+), 146 deletions(-) diff --git a/tensorflow/core/kernels/mkl_dequantize_op.cc b/tensorflow/core/kernels/mkl_dequantize_op.cc index 4c9dbf4274a..2e046bf85bb 100644 --- a/tensorflow/core/kernels/mkl_dequantize_op.cc +++ b/tensorflow/core/kernels/mkl_dequantize_op.cc @@ -17,18 +17,18 @@ limitations under the License. #define EIGEN_USE_THREADS +#include "mkldnn.hpp" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/type_traits.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/graph/mkl_graph_util.h" #include "tensorflow/core/kernels/meta_support.h" #include "tensorflow/core/kernels/quantization_utils.h" #include "tensorflow/core/lib/core/errors.h" - -#include "tensorflow/core/graph/mkl_graph_util.h" +#include "tensorflow/core/util/mkl_types.h" #include "tensorflow/core/util/mkl_util.h" -#include "mkldnn.hpp" using mkldnn::primitive_attr; using mkldnn::stream; @@ -51,7 +51,7 @@ class MklDequantizeOp : public OpKernel { void Compute(OpKernelContext* ctx) override { try { // Using CPU device - auto cpu_engine = engine(engine::cpu, 0); + auto cpu_engine = engine(ENGINE_CPU, 0); // Get the inputs const Tensor& src_tensor = MklGetInput(ctx, kSrcIndex); @@ -82,33 +82,28 @@ class MklDequantizeOp : public OpKernel { auto src_md = src_mkl_shape.IsMklTensor() ? src_mkl_shape.GetMklLayout() - : memory::desc(src_dims, MklDnnType(), memory::format::nhwc); + : memory::desc(src_dims, MklDnnType(), MEMORY_FORMAT::nhwc); src.SetUsrMem(src_md, &src_tensor); Tensor* output_tensor = nullptr; MklDnnShape output_mkl_shape; TensorShape output_tf_shape; - - memory::primitive_desc src_pd = - memory::primitive_desc(src_md, cpu_engine); memory::desc dst_md = src_mkl_shape.IsMklTensor() ? src_md : memory::desc(src_dims, MklDnnType(), - memory::format::nhwc); - memory::primitive_desc dst_pd = - memory::primitive_desc(dst_md, cpu_engine); - + MEMORY_FORMAT::nhwc); // If input is MKL shape, output is also MKL shape. // If input is TF shape, output is also TF shape. 
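        // Minimal sketch (illustrative only, not executed here; assumes
        // MKL-DNN v1.x memory objects and a stream already exist): the
        // dequantization below is just a reorder with an output scale of
        // max_abs / target_range, e.g.
        //   mkldnn::primitive_attr attr;
        //   attr.set_output_scales(0, {scale_factor});
        //   auto rpd = mkldnn::reorder::primitive_desc(src_mem, dst_mem, attr);
        //   mkldnn::reorder(rpd).execute(reorder_stream,
        //                                {{MKLDNN_ARG_FROM, src_mem},
        //                                 {MKLDNN_ARG_TO, dst_mem}});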
if (src_mkl_shape.IsMklTensor()) { output_mkl_shape.SetMklTensor(true); - output_mkl_shape.SetMklLayout(&dst_pd); + output_mkl_shape.SetMklLayout(&dst_md); output_mkl_shape.SetElemType(MklDnnType()); output_mkl_shape.SetTfLayout(src_mkl_shape.GetDimension(), src_mkl_shape.GetSizesAsMklDnnDims(), src_mkl_shape.GetTfDataFormat()); - output_tf_shape.AddDim((dst_pd.get_size() / sizeof(float))); + output_tf_shape.AddDim(GET_MEMORY_SIZE_FROM_MD(dst_md, cpu_engine) / + sizeof(float)); } else { output_mkl_shape.SetMklTensor(false); output_tf_shape = MklDnnDimsToTFShape(output_dims); @@ -135,20 +130,35 @@ class MklDequantizeOp : public OpKernel { const float target_range = static_cast((uint64_t{1} << target_bits) - 1); const float scale_factor = max_abs / target_range; - std::vector scales; scales.push_back(scale_factor); primitive_attr attr; attr.set_output_scales(0, scales); +#ifndef ENABLE_MKLDNN_V1 + // MKL-DNN 1.0 does not provide set_int_output_round_mode() API. + // Also it does not define round_nearest (enum). attr.set_int_output_round_mode(mkldnn::round_mode::round_nearest); - mkldnn::reorder::primitive_desc reorder_pd = - mkldnn::reorder::primitive_desc(src_pd, dst_pd, attr); - - // Execute MKL-DNN primitive +#endif // !ENABLE_MKLDNN_V1 + stream reorder_stream = CPU_STREAM(cpu_engine); std::vector net; - net.push_back( - mkldnn::reorder(reorder_pd, *src.GetUsrMem(), *dst.GetUsrMem())); - stream(stream::kind::eager).submit(net).wait(); + + // Create reorder primitive and then execute. + auto reorder_pd = REORDER_PD_CONSTRUCTOR_WITH_ATTR( + GET_MEMORY_PRIMITIVE_DESC_FROM_MEM_PTR(src.GetUsrMem()), + GET_MEMORY_PRIMITIVE_DESC_FROM_MEM_PTR(dst.GetUsrMem()), cpu_engine, + attr); +#ifdef ENABLE_MKLDNN_V1 + net.push_back(reorder(reorder_pd)); + std::vector> reorder_net_args; + reorder_net_args.push_back({{MKLDNN_ARG_FROM, *src.GetUsrMem()}, + { MKLDNN_ARG_TO, + *dst.GetUsrMem() }}); + execute_primitives(net, std::make_shared(reorder_stream), + reorder_net_args); +#else + net.push_back(reorder(reorder_pd, *src.GetUsrMem(), *dst.GetUsrMem())); + reorder_stream.submit(net); +#endif // ENABLE_MKLDNN_V1 } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + ", message: " + string(e.message) + ", in file " + diff --git a/tensorflow/core/kernels/mkl_quantize_op.cc b/tensorflow/core/kernels/mkl_quantize_op.cc index 985f1cd8c88..d049b5f58d2 100644 --- a/tensorflow/core/kernels/mkl_quantize_op.cc +++ b/tensorflow/core/kernels/mkl_quantize_op.cc @@ -17,9 +17,7 @@ limitations under the License. #define EIGEN_USE_THREADS -#include "mkldnn.h" #include "mkldnn.hpp" -#include "mkldnn_types.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/type_traits.h" @@ -27,6 +25,7 @@ limitations under the License. 
#include "tensorflow/core/graph/mkl_graph_util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/util/mkl_types.h" #include "tensorflow/core/util/mkl_util.h" using mkldnn::primitive_attr; @@ -56,7 +55,6 @@ enum { } // namespace namespace tensorflow { - typedef Eigen::ThreadPoolDevice CPUDevice; struct MklReorderWithScaleFwdParams { @@ -78,20 +76,28 @@ struct MklReorderWithScaleFwdParams { class MklReorderWithScalePrimitive : public MklPrimitive { public: explicit MklReorderWithScalePrimitive( - const memory* from, const memory* to, - const MklReorderWithScaleFwdParams& fwdParams) { + const MklReorderWithScaleFwdParams& fwdParams) + : cpu_engine_(ENGINE_CPU, 0) { // Create reorder primitive - Setup(from, to, fwdParams); + Setup(fwdParams); } ~MklReorderWithScalePrimitive() {} std::shared_ptr GetPrimitive() { return context_.reorder_prim; } - // set data handles - void SetMemory(const memory* from, const memory* to) { - context_.src_mem->set_data_handle(from->get_data_handle()); - context_.dst_mem->set_data_handle(to->get_data_handle()); + void Execute(void* src_data, void* dst_data) { + context_.src_mem->set_data_handle(src_data); + context_.dst_mem->set_data_handle(dst_data); +#ifndef ENABLE_MKLDNN_V1 + context_.reorder_stream->submit(context_.net); +#else + context_.reorder_prim->execute(*context_.reorder_stream, + context_.prim_args); +#endif // !ENABLE_MKLDNN_V1 + // After execution, set data handle back. + context_.src_mem->set_data_handle(DummyData); + context_.dst_mem->set_data_handle(DummyData); } private: @@ -101,41 +107,36 @@ class MklReorderWithScalePrimitive : public MklPrimitive { std::shared_ptr src_mem; std::shared_ptr dst_mem; - // Memory desc - std::shared_ptr src_md; - std::shared_ptr dst_md; - - // Memory primitive desc - std::shared_ptr src_mpd; - std::shared_ptr dst_mpd; - // Reorder primitive descriptor and primitive std::shared_ptr reorder_pd; std::shared_ptr reorder_prim; + // Stream and primitive vector + std::shared_ptr reorder_stream; + +#ifndef ENABLE_MKLDNN_V1 + std::vector net; +#else + std::unordered_map prim_args; +#endif // !ENABLE_MKLDNN_V1 + ReorderContext() : src_mem(nullptr), dst_mem(nullptr), - src_md(nullptr), - dst_md(nullptr), - src_mpd(nullptr), - dst_mpd(nullptr), reorder_pd(nullptr), - reorder_prim(nullptr) {} + reorder_prim(nullptr), + reorder_stream(nullptr) {} } context_; - engine cpu_engine_ = engine(engine::cpu, 0); + engine cpu_engine_; // Reorder primitive setup - void Setup(const memory* from, const memory* to, - const MklReorderWithScaleFwdParams& fwdParams) { + void Setup(const MklReorderWithScaleFwdParams& fwdParams) { // Create memory descriptors for reorder data with specified format - context_.src_md.reset(new memory::desc(fwdParams.src_md.data)); - context_.dst_md.reset(new memory::desc(fwdParams.dst_md.data)); - context_.src_mpd.reset( - new memory::primitive_desc(*context_.src_md, cpu_engine_)); - context_.dst_mpd.reset( - new memory::primitive_desc(*context_.dst_md, cpu_engine_)); + context_.src_mem.reset(new MEMORY_CONSTRUCTOR_USING_MD( + fwdParams.src_md, cpu_engine_, DummyData)); + context_.dst_mem.reset(new MEMORY_CONSTRUCTOR_USING_MD( + fwdParams.dst_md, cpu_engine_, DummyData)); // Check if there is any fusion as post-ops auto const& post_op_params = fwdParams.post_op_params; @@ -147,18 +148,22 @@ class MklReorderWithScalePrimitive : public MklPrimitive { scales.push_back(post_op_params.param[0]); post_ops_attr.set_output_scales(0, scales); - // Create a 
reorder - context_.reorder_pd = - std::make_shared(reorder::primitive_desc( - *context_.src_mpd, *context_.dst_mpd, post_ops_attr)); + context_.reorder_pd.reset(new REORDER_PD_CONSTRUCTOR_WITH_ATTR( + GET_MEMORY_PRIMITIVE_DESC_FROM_MEM_PTR(context_.src_mem), + GET_MEMORY_PRIMITIVE_DESC_FROM_MEM_PTR(context_.dst_mem), cpu_engine_, + post_ops_attr)); - // Create memory primitive based on dummy data - context_.src_mem.reset(new memory(*context_.src_mpd, DummyData)); - context_.dst_mem.reset(new memory(*context_.dst_mpd, DummyData)); - - // Create reorder primitive - context_.reorder_prim = std::make_shared( - reorder(*context_.reorder_pd, *context_.src_mem, *context_.dst_mem)); +// Create reorder primitive +#ifndef ENABLE_MKLDNN_V1 + context_.reorder_prim.reset(new reorder( + *context_.reorder_pd, *context_.src_mem, *context_.dst_mem)); + context_.net.push_back(*context_.reorder_prim); +#else + context_.reorder_prim.reset(new reorder(*context_.reorder_pd)); + context_.prim_args.insert({MKLDNN_ARG_FROM, *context_.src_mem}); + context_.prim_args.insert({MKLDNN_ARG_TO, *context_.dst_mem}); +#endif // !ENABLE_MKLDNN_V1 + context_.reorder_stream.reset(new CPU_STREAM(cpu_engine_)); } }; @@ -173,11 +178,10 @@ class MklReorderWithScalePrimitiveFactory : public MklPrimitiveFactory { MklReorderWithScalePrimitiveFactory::GetInstance().GetReorder( from, to, fwdParams)); if (reorderPrim == nullptr) { - reorderPrim = new MklReorderWithScalePrimitive(from, to, fwdParams); + reorderPrim = new MklReorderWithScalePrimitive(fwdParams); MklReorderWithScalePrimitiveFactory::GetInstance().SetReorder( from, to, reorderPrim, fwdParams); } - reorderPrim->SetMemory(from, to); return reorderPrim; } @@ -192,20 +196,8 @@ class MklReorderWithScalePrimitiveFactory : public MklPrimitiveFactory { static string CreateKey(const memory* from, const memory* to, const MklReorderWithScaleFwdParams& fwdParams) { - string dtypes = string(""); - string prefix = "reorder"; FactoryKeyCreator key_creator; - auto const& from_desc = from->get_primitive_desc().desc().data; - auto const& to_desc = to->get_primitive_desc().desc().data; - - key_creator.AddAsKey(prefix); - key_creator.AddAsKey(static_cast(from_desc.format)); - key_creator.AddAsKey(static_cast(from_desc.data_type)); - key_creator.AddAsKey(fwdParams.src_dims); - key_creator.AddAsKey(static_cast(to_desc.format)); - key_creator.AddAsKey(static_cast(to_desc.data_type)); - key_creator.AddAsKey(fwdParams.dtypes); - + key_creator.AddAsKey(MklReorderPrimitiveFactory::CreateKey(from, to)); // Generate key for post-op scale if (fwdParams.post_op_params.name == "scale") { DCHECK_EQ(fwdParams.post_op_params.param.size(), 1); @@ -231,21 +223,6 @@ class MklReorderWithScalePrimitiveFactory : public MklPrimitiveFactory { } }; -// Fuction to find (or create) a reorder from memory pointed by -// 'from' to memory pointed by 'to', it will create primitive or -// get primitive from pool if it is cached. -// Returns the primitive. -template -inline primitive FindOrCreateReorder( - const memory* from, const memory* to, - const MklReorderWithScaleFwdParams& fwdParams) { - DCHECK(from); - DCHECK(to); - MklReorderWithScalePrimitive* reorder_prim = - MklReorderWithScalePrimitiveFactory::Get(from, to, fwdParams); - return *reorder_prim->GetPrimitive(); -} - // Quantizes a tensor from float to T, with user-specified min_range and // max_range. 
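// Worked example (illustrative, mirrors the scale computation inside Compute()
// below): in MIN_FIRST mode with an 8-bit type, min_range = -1.0f and
// max_range = 3.0f give scale_factor = (2^8 - 1) / (3.0 - (-1.0)) = 63.75, so
// the input value 3.0f maps to round((3.0 - (-1.0)) * 63.75) = 255. The helper
// name below is hypothetical.
static inline float MinFirstScaleFactor(float min_range, float max_range,
                                        int num_bits) {
  const int64 number_of_steps = static_cast<int64>(1) << num_bits;
  return (number_of_steps - 1.0f) / (max_range - min_range);
}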
template @@ -300,7 +277,7 @@ class MklQuantizeV2Op : public OpKernel { "Scalar calculation in MKL is supported only for" "MIN_FIRST mode for now.")); - auto cpu_engine = engine(engine::cpu, 0); + auto cpu_engine = engine(ENGINE_CPU, 0); const Tensor& input = ctx->input(0); const unsigned int src_idx = 0; const Tensor& src_tensor = MklGetInput(ctx, src_idx); @@ -366,7 +343,7 @@ class MklQuantizeV2Op : public OpKernel { max_range = std::max(input_max_range, min_range + epsilon); // Clamping the max_range to zero since max_range can also be negative. max_range = std::max(0.0f, max_range); - auto cpu_engine = engine(engine::cpu, 0); + auto cpu_engine = engine(ENGINE_CPU, 0); const Tensor& src_tensor = MklGetInput(ctx, src_idx); MklDnnShape src_mkl_shape; GetMklShape(ctx, src_idx, &src_mkl_shape); @@ -377,25 +354,25 @@ class MklQuantizeV2Op : public OpKernel { : TFShapeToMklDnnDims(src_tensor.shape()); auto output_dims = src_dims; // Set the dst layout to be the best mkl layout based on dims and type. - memory::format dst_layout_type; + MEMORY_FORMAT dst_layout_type; switch (src_tf_shape.dims()) { case 0: ComputeScalar(ctx, min_range, max_range); return; case 1: - dst_layout_type = memory::format::x; + dst_layout_type = MEMORY_FORMAT::x; break; case 2: - dst_layout_type = memory::format::nc; + dst_layout_type = MEMORY_FORMAT::nc; break; case 3: - dst_layout_type = memory::format::tnc; + dst_layout_type = MEMORY_FORMAT::tnc; break; case 4: - dst_layout_type = memory::format::nhwc; + dst_layout_type = MEMORY_FORMAT::nhwc; break; case 5: - dst_layout_type = memory::format::ndhwc; + dst_layout_type = MEMORY_FORMAT::ndhwc; break; default: OP_REQUIRES_OK(ctx, @@ -414,11 +391,11 @@ class MklQuantizeV2Op : public OpKernel { // If the mode is min_first, input data has to be subtracted from // min_range, before being scaled auto flat_input = input.flat().data(); - Tensor minfirst_tmpinput; - OP_REQUIRES_OK( - ctx, ctx->allocate_temp(DT_FLOAT, input.shape(), &minfirst_tmpinput)); + Tensor min_shifted_input_tensor; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_FLOAT, input.shape(), + &min_shifted_input_tensor)); if (mode_ == QUANTIZE_MODE_MIN_FIRST) { - auto minfirst_input = minfirst_tmpinput.flat().data(); + auto minfirst_input = min_shifted_input_tensor.flat().data(); const Eigen::TensorOpCost cost( sizeof(float), /*load bytes*/ sizeof(float), /*saved bytes*/ @@ -432,25 +409,27 @@ class MklQuantizeV2Op : public OpKernel { }; d.parallelFor(input.NumElements(), cost, ParallelSub); - src.SetUsrMem(src_md, minfirst_input); + src.SetUsrMem(src_md, &min_shifted_input_tensor); } else { src.SetUsrMem(src_md, &src_tensor); } memory::desc dst_md = memory::desc(src_dims, MklDnnType(), dst_layout_type); - auto dst_pd = src.GetUsrMemPrimDesc(); +#ifndef ENABLE_MKLDNN_V1 + auto dst_pd = memory::primitive_desc(dst_md, cpu_engine); +#endif // !ENABLE_MKLDNN_V1 // Standard shape assignments for layout pass MklDnnShape output_mkl_shape; TensorShape output_tf_shape; if (src_mkl_shape.IsMklTensor()) { output_mkl_shape.SetMklTensor(true); - output_mkl_shape.SetMklLayout(&dst_md); + output_mkl_shape.SetMklLayout(&DST_MD); output_mkl_shape.SetElemType(MklDnnType()); output_mkl_shape.SetTfLayout(src_mkl_shape.GetDimension(), src_mkl_shape.GetSizesAsMklDnnDims(), src_mkl_shape.GetTfDataFormat()); - output_tf_shape.AddDim(dst_pd.get_size() / sizeof(T)); + output_tf_shape.AddDim(DST_MD.get_size() / sizeof(T)); } else { output_mkl_shape.SetMklTensor(false); output_tf_shape = MklDnnDimsToTFShape(output_dims); @@ -459,6 +438,8 @@ class 
MklQuantizeV2Op : public OpKernel { Tensor* output_tensor = nullptr; AllocateOutputSetMklShape(ctx, 0, &output_tensor, output_tf_shape, output_mkl_shape); + dst.SetUsrMem(dst_md, output_tensor); + TensorShape min_tf_shape = {}; MklDnnShape min_mkl_shape; min_mkl_shape.SetMklTensor(false); @@ -472,8 +453,6 @@ class MklQuantizeV2Op : public OpKernel { AllocateOutputSetMklShape(ctx, 2, &output_max_tensor, max_tf_shape, max_mkl_shape); - dst.SetUsrMem(dst_md, output_tensor); - float scale_factor = 0; if (mode_ == QUANTIZE_MODE_SCALED) { // Estimating scales for quantization. @@ -497,41 +476,25 @@ class MklQuantizeV2Op : public OpKernel { target_range = static_cast((uint64_t{1} << num_bits) - 1); } scale_factor = target_range / max_abs; - - output_min_tensor->flat()(0) = min_range; - output_max_tensor->flat()(0) = max_range; - - // Primitive creation and stream submit - std::vector scales{scale_factor}; - mkldnn::primitive_attr attr; - attr.set_output_scales(0, scales); - auto reorder_desc = reorder::primitive_desc( - src.GetUsrMemPrimDesc(), dst.GetUsrMemPrimDesc(), attr); - reorder my_reorder = reorder( - reorder_desc, primitive::at(*src.GetUsrMem()), *dst.GetUsrMem()); - std::vector net{my_reorder}; - stream(stream::kind::eager).submit(net).wait(); } else if (mode_ == QUANTIZE_MODE_MIN_FIRST) { // Estimate scale for qunatization const int number_of_bits = sizeof(T) * 8; const int64 number_of_steps = static_cast(1) << number_of_bits; scale_factor = (number_of_steps - 1.0) / (max_range - min_range); - - output_min_tensor->flat()(0) = min_range; - output_max_tensor->flat()(0) = max_range; - - MklReorderWithScaleFwdParams fwdParams(src_dims, src_md, dst_md); - fwdParams.dtypes.append(typeid(T).name()); - - fwdParams.post_op_params.name = "scale"; - fwdParams.post_op_params.param.push_back(scale_factor); - - // Get primitive from pool or create one and submit - std::vector net; - net.push_back( - FindOrCreateReorder(src.GetUsrMem(), dst.GetUsrMem(), fwdParams)); - stream(stream::kind::eager).submit(net).wait(); } + + MklReorderWithScaleFwdParams fwdParams(src_dims, src_md, dst_md); + fwdParams.dtypes.append(typeid(T).name()); + fwdParams.post_op_params.name = "scale"; + fwdParams.post_op_params.param.push_back(scale_factor); + + MklReorderWithScalePrimitive* reorder_prim = + MklReorderWithScalePrimitiveFactory::Get(src.GetUsrMem(), + dst.GetUsrMem(), fwdParams); + reorder_prim->Execute(src.GetUsrMemDataHandle(), dst.GetUsrMemDataHandle()); + + output_min_tensor->flat()(0) = min_range; + output_max_tensor->flat()(0) = max_range; } private: diff --git a/tensorflow/core/util/mkl_types.h b/tensorflow/core/util/mkl_types.h index eede9b6087f..558c57a1851 100644 --- a/tensorflow/core/util/mkl_types.h +++ b/tensorflow/core/util/mkl_types.h @@ -39,6 +39,7 @@ namespace tensorflow { #define GET_MEMORY_DESC_FROM_MEM_PTR(mem_ptr) mem_ptr->get_desc() #define GET_MEMORY_PRIMITIVE_DESC_FROM_MEM_PTR(mem_ptr) \ GET_MEMORY_DESC_FROM_MEM_PTR(mem_ptr) +#define GET_MEMORY_SIZE_FROM_MD(md, engine) md.get_size() #define GET_SRC_DESC_FROM_OP_PD(op_pd) op_pd->src_desc() #define GET_DIFF_DST_DESC_FROM_OP_PD(op_pd) op_pd->diff_dst_desc() #define GET_WORKSPACE_DESC_FROM_OP_PD(op_pd) op_pd->workspace_desc() @@ -131,6 +132,8 @@ namespace tensorflow { #define GET_BLOCK_STRIDES(strides, idx) strides[(idx)] #define GET_MEMORY_DESC_CONSTRUCTOR(dims, type, fm) \ { {dims}, MklDnnType(), fm } +#define GET_MEMORY_SIZE_FROM_MD(md, engine) \ + memory::primitive_desc(md, engine).get_size() #define GET_SRC_DESC_FROM_OP_PD(op_pd) 
op_pd.get()->src_primitive_desc() #define GET_DIFF_DST_DESC_FROM_OP_PD(op_pd) \ op_pd.get()->diff_dst_primitive_desc() diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index a782e76547b..582b0525323 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -2078,10 +2078,6 @@ class MklReorderPrimitiveFactory : public MklPrimitiveFactory { return instance_; } - private: - MklReorderPrimitiveFactory() {} - ~MklReorderPrimitiveFactory() {} - static string CreateKey(const memory* from, const memory* to) { string prefix = "reorder"; FactoryKeyCreator key_creator; @@ -2117,6 +2113,10 @@ class MklReorderPrimitiveFactory : public MklPrimitiveFactory { return key_creator.GetKey(); } + private: + MklReorderPrimitiveFactory() {} + ~MklReorderPrimitiveFactory() {} + MklPrimitive* GetReorder(const memory* from, const memory* to) { string key = CreateKey(from, to); return this->GetOp(key); From b15bccccbcddef2fa576e14b7e67a06e10f11690 Mon Sep 17 00:00:00 2001 From: Niranjan Hasabnis Date: Sat, 15 Feb 2020 10:20:13 -0800 Subject: [PATCH 086/442] Addressing comments --- .../core/kernels/mkl_batch_matmul_op.cc | 159 ++++++++---------- tensorflow/core/kernels/mkl_matmul_op.cc | 4 +- .../core/kernels/mkl_matmul_op_fused.cc | 4 +- .../core/kernels/mkl_matmul_ops_common.h | 13 +- tensorflow/core/kernels/mkl_qmatmul_op.cc | 20 +-- tensorflow/core/util/mkl_types.h | 2 +- 6 files changed, 85 insertions(+), 117 deletions(-) diff --git a/tensorflow/core/kernels/mkl_batch_matmul_op.cc b/tensorflow/core/kernels/mkl_batch_matmul_op.cc index f96f0e1183f..f409d2a8cb5 100644 --- a/tensorflow/core/kernels/mkl_batch_matmul_op.cc +++ b/tensorflow/core/kernels/mkl_batch_matmul_op.cc @@ -174,122 +174,105 @@ class BatchMatMulMkl : public OpKernel { } } - MklCblasGemmBatch(CblasRowMajor, adj_x_, adj_y_, m_array, n_array, k_array, - &a_array[0], lda_array, &b_array[0], ldb_array, - &c_array[0], ldc_array, 1, group_size); + MklCblasGemmBatch( + CblasRowMajor, adj_x_, adj_y_, m_array, n_array, k_array, + reinterpret_cast(&a_array[0]), lda_array, + reinterpret_cast(&b_array[0]), ldb_array, + reinterpret_cast(&c_array[0]), ldc_array, 1, group_size); } private: bool adj_x_; bool adj_y_; + template ::value || + std::is_same::value), + int>::type = 0> void MklCblasGemmBatch( const CBLAS_LAYOUT Layout, const bool TransA, const bool TransB, const std::vector& M_Array, const std::vector& N_Array, - const std::vector& K_Array, const float** A_Array, - const std::vector& lda_Array, const float** B_Array, - const std::vector& ldb_Array, float** C_Array, + const std::vector& K_Array, const void** A_Array, + const std::vector& lda_Array, const void** B_Array, + const std::vector& ldb_Array, void** C_Array, const std::vector& ldc_Array, const MKL_INT group_count, const std::vector& group_size) { std::vector TransA_Array( group_size[0], TransA ? CblasTrans : CblasNoTrans); std::vector TransB_Array( group_size[0], TransB ? 
CblasTrans : CblasNoTrans); - std::vector alpha_Array(group_size[0], 1.0); - std::vector beta_Array(group_size[0], 0.0); - cblas_sgemm_batch(Layout, &TransA_Array[0], &TransB_Array[0], &M_Array[0], - &N_Array[0], &K_Array[0], &alpha_Array[0], A_Array, - &lda_Array[0], B_Array, &ldb_Array[0], &beta_Array[0], - C_Array, &ldc_Array[0], group_count, &group_size[0]); + if (std::is_same::value) { + std::vector alpha_Array(group_size[0], 1.0); + std::vector beta_Array(group_size[0], 0.0); + cblas_sgemm_batch(Layout, &TransA_Array[0], &TransB_Array[0], &M_Array[0], + &N_Array[0], &K_Array[0], &alpha_Array[0], + reinterpret_cast(A_Array), &lda_Array[0], + reinterpret_cast(B_Array), &ldb_Array[0], + &beta_Array[0], reinterpret_cast(C_Array), + &ldc_Array[0], group_count, &group_size[0]); + } else { + std::vector alpha_Array(group_size[0], 1.0); + std::vector beta_Array(group_size[0], 0.0); + cblas_dgemm_batch( + Layout, &TransA_Array[0], &TransB_Array[0], &M_Array[0], &N_Array[0], + &K_Array[0], &alpha_Array[0], + reinterpret_cast(A_Array), &lda_Array[0], + reinterpret_cast(B_Array), &ldb_Array[0], + &beta_Array[0], reinterpret_cast(C_Array), &ldc_Array[0], + group_count, &group_size[0]); + } } -#ifdef ENABLE_MKLDNN_V1_2 + template ::value || + std::is_same::value), + int>::type = 0> void MklCblasGemmBatch( const CBLAS_LAYOUT Layout, const bool TransA, const bool TransB, const std::vector& M_Array, const std::vector& N_Array, - const std::vector& K_Array, const bfloat16** A_Array, - const std::vector& lda_Array, const bfloat16** B_Array, - const std::vector& ldb_Array, bfloat16** C_Array, + const std::vector& K_Array, const void** A_Array, + const std::vector& lda_Array, const void** B_Array, + const std::vector& ldb_Array, void** C_Array, + const std::vector& ldc_Array, const MKL_INT group_count, + const std::vector& group_size) { + std::vector TransA_array( + group_size[0], TransA ? CblasConjTrans : CblasNoTrans); + std::vector TransB_array( + group_size[0], TransB ? CblasConjTrans : CblasNoTrans); + std::vector alpha_Array(group_size[0], {1.0f, 0.0f}); + std::vector beta_Array(group_size[0], {0.0f, 0.0f}); + auto gemm_fn = (std::is_same::value) ? 
cblas_cgemm_batch + : cblas_zgemm_batch; + gemm_fn(Layout, &TransA_array[0], &TransB_array[0], &M_Array[0], + &N_Array[0], &K_Array[0], static_cast(&alpha_Array[0]), + reinterpret_cast(A_Array), &lda_Array[0], + reinterpret_cast(B_Array), &ldb_Array[0], + static_cast(&beta_Array[0]), + reinterpret_cast(C_Array), &ldc_Array[0], group_count, + &group_size[0]); + } + +#ifdef ENABLE_MKLDNN_V1_2 + void MklCblasGemmBatch( + const CBLAS_LAYOUT Layout, const bool TransA, const bool TransB, + const std::vector& M_Array, const std::vector& N_Array, + const std::vector& K_Array, const void** A_Array, + const std::vector& lda_Array, const void** B_Array, + const std::vector& ldb_Array, void** C_Array, const std::vector& ldc_Array, const MKL_INT group_count, const std::vector& group_size) { std::vector TransA_Array(group_size[0], TransA); std::vector TransB_Array(group_size[0], TransB); std::vector alpha_Array(group_size[0], 1.0); std::vector beta_Array(group_size[0], 0.0); - dnnl_gemm_batch(Layout, TransA_Array, TransB_Array, M_Array, - N_Array, K_Array, alpha_Array, A_Array, lda_Array, - B_Array, ldb_Array, beta_Array, C_Array, - ldc_Array, group_count, group_size); + dnnl_gemm_batch( + Layout, TransA_Array, TransB_Array, M_Array, N_Array, K_Array, + alpha_Array, reinterpret_cast(A_Array), lda_Array, + reinterpret_cast(B_Array), ldb_Array, beta_Array, + reinterpret_cast(C_Array), ldc_Array, group_count, + group_size); } #endif // ENABLE_MKLDNN_V1_2 - - void MklCblasGemmBatch( - const CBLAS_LAYOUT Layout, const bool TransA, const bool TransB, - const std::vector& M_Array, const std::vector& N_Array, - const std::vector& K_Array, const double** A_Array, - const std::vector& lda_Array, const double** B_Array, - const std::vector& ldb_Array, double** C_Array, - const std::vector& ldc_Array, const MKL_INT group_count, - const std::vector& group_size) { - std::vector TransA_array( - group_size[0], TransA ? CblasTrans : CblasNoTrans); - std::vector TransB_array( - group_size[0], TransB ? CblasTrans : CblasNoTrans); - std::vector alpha_Array(group_size[0], 1.0); - std::vector beta_Array(group_size[0], 0.0); - cblas_dgemm_batch(Layout, &TransA_array[0], &TransB_array[0], &M_Array[0], - &N_Array[0], &K_Array[0], &alpha_Array[0], A_Array, - &lda_Array[0], B_Array, &ldb_Array[0], &beta_Array[0], - C_Array, &ldc_Array[0], group_count, &group_size[0]); - } - - void MklCblasGemmBatch( - const CBLAS_LAYOUT Layout, const bool TransA, const bool TransB, - const std::vector& M_Array, const std::vector& N_Array, - const std::vector& K_Array, const complex64** A_Array, - const std::vector& lda_Array, const complex64** B_Array, - const std::vector& ldb_Array, complex64** C_Array, - const std::vector& ldc_Array, const MKL_INT group_count, - const std::vector& group_size) { - std::vector TransA_array( - group_size[0], TransA ? CblasConjTrans : CblasNoTrans); - std::vector TransB_array( - group_size[0], TransB ? 
CblasConjTrans : CblasNoTrans); - std::vector alpha_Array(group_size[0], {1.0f, 0.0f}); - std::vector beta_Array(group_size[0], {0.0f, 0.0f}); - cblas_cgemm_batch(Layout, &TransA_array[0], &TransB_array[0], &M_Array[0], - &N_Array[0], &K_Array[0], - static_cast(&alpha_Array[0]), - reinterpret_cast(A_Array), &lda_Array[0], - reinterpret_cast(B_Array), &ldb_Array[0], - static_cast(&beta_Array[0]), - reinterpret_cast(C_Array), &ldc_Array[0], - group_count, &group_size[0]); - } - - void MklCblasGemmBatch( - const CBLAS_LAYOUT Layout, const bool TransA, const bool TransB, - const std::vector& M_Array, const std::vector& N_Array, - const std::vector& K_Array, const complex128** A_Array, - const std::vector& lda_Array, const complex128** B_Array, - const std::vector& ldb_Array, complex128** C_Array, - const std::vector& ldc_Array, const MKL_INT group_count, - const std::vector& group_size) { - std::vector TransA_array( - group_size[0], TransA ? CblasConjTrans : CblasNoTrans); - std::vector TransB_array( - group_size[0], TransB ? CblasConjTrans : CblasNoTrans); - std::vector alpha_Array(group_size[0], {1.0f, 0.0f}); - std::vector beta_Array(group_size[0], {0.0f, 0.0f}); - cblas_zgemm_batch(Layout, &TransA_array[0], &TransB_array[0], &M_Array[0], - &N_Array[0], &K_Array[0], - static_cast(&alpha_Array[0]), - reinterpret_cast(A_Array), &lda_Array[0], - reinterpret_cast(B_Array), &ldb_Array[0], - static_cast(&beta_Array[0]), - reinterpret_cast(C_Array), &ldc_Array[0], - group_count, &group_size[0]); - } }; #define REGISTER_BATCH_MATMUL_MKL(TYPE) \ diff --git a/tensorflow/core/kernels/mkl_matmul_op.cc b/tensorflow/core/kernels/mkl_matmul_op.cc index 83d8255bdaa..b1e5a15b95a 100644 --- a/tensorflow/core/kernels/mkl_matmul_op.cc +++ b/tensorflow/core/kernels/mkl_matmul_op.cc @@ -268,10 +268,10 @@ class MklMatMulOp : public OpKernel { // TODO(inteltf) Consider template specialization when adding/removing // additional types TF_CALL_float(REGISTER_CPU); -#ifndef ENABLE_MKLDNN_V1 +#if !defined(ENABLE_MKLDNN_V1) || defined(ENABLE_MKLDNN_V1_2) // MKLDNNv1 does not have support for bfloat16 GEMM. Only V1.2 has that support. 
TF_CALL_bfloat16(REGISTER_CPU); -#endif // ENABLE_MKLDNN_V1 +#endif // !defined(ENABLE_MKLDNN_V1) || defined(ENABLE_MKLDNN_V1_2) #ifndef INTEL_MKL_DNN_ONLY TF_CALL_double(REGISTER_CPU); diff --git a/tensorflow/core/kernels/mkl_matmul_op_fused.cc b/tensorflow/core/kernels/mkl_matmul_op_fused.cc index 755919d8e68..20d5ce3a1ec 100644 --- a/tensorflow/core/kernels/mkl_matmul_op_fused.cc +++ b/tensorflow/core/kernels/mkl_matmul_op_fused.cc @@ -187,7 +187,7 @@ class MklFusedMatMulOp : public MklDnnMatMulOpBase { void ExtendMklDnnMatMulFwdParams(OpKernelContext* ctx, MklDnnMatMulFwdParams& params) { -#ifndef ENABLE_MKL_DNN_V1 +#ifndef ENABLE_MKLDNN_V1 if (fused_ops_.size() == 2) { string post_op = fused_ops_[1]; @@ -203,7 +203,7 @@ class MklFusedMatMulOp : public MklDnnMatMulOpBase { "Unsupported post-argument in MklFusedMatMul: ", post_op)); } } -#endif +#endif // !ENABLE_MKLDNN_V1 } private: diff --git a/tensorflow/core/kernels/mkl_matmul_ops_common.h b/tensorflow/core/kernels/mkl_matmul_ops_common.h index 44eecc65b94..3147921b8d3 100644 --- a/tensorflow/core/kernels/mkl_matmul_ops_common.h +++ b/tensorflow/core/kernels/mkl_matmul_ops_common.h @@ -97,11 +97,8 @@ class MklDnnMatMulFwdPrimitive : public MklPrimitive { context_.dst_mem->set_data_handle(static_cast(dst_data)); #ifdef ENABLE_MKLDNN_V1 - DCHECK_EQ(context_.fwd_primitives.size(), context_.net_args.size()); - for (size_t i = 0; i < context_.fwd_primitives.size(); ++i) { - context_.fwd_primitives.at(i).execute(*context_.fwd_stream, - context_.net_args.at(i)); - } + execute_primitives(context_.fwd_primitives, context_.fwd_stream, + context_.net_args); #else context_.fwd_stream->submit(context_.fwd_primitives); #endif // ENABLE_MKLDNN_V1 @@ -117,7 +114,7 @@ class MklDnnMatMulFwdPrimitive : public MklPrimitive { // In MKL-DNN v1.x, memory format tags only provide a partial description // of the memory layout. Hence, these functions are disabled for v1.x. memory::format GetSrcMemoryFormat() const { return context_.src_fmt; } - memory::format GetweightMemoryFormat() const { return context_.weight_fmt; } + memory::format GetWeightMemoryFormat() const { return context_.weight_fmt; } #endif // ENABLE_MKLDNN_V1 std::shared_ptr @@ -132,7 +129,7 @@ class MklDnnMatMulFwdPrimitive : public MklPrimitive { // Expected memory format for this primitive instance MEMORY_FORMAT src_fmt; MEMORY_FORMAT weight_fmt; -#endif // ENABLE_MKLDNN_V1 +#endif // !ENABLE_MKLDNN_V1 // MKL-DNN memory. std::shared_ptr src_mem; @@ -164,7 +161,7 @@ class MklDnnMatMulFwdPrimitive : public MklPrimitive { #ifndef ENABLE_MKLDNN_V1 src_fmt(MEMORY_FORMAT::any), weight_fmt(MEMORY_FORMAT::any), -#endif // ENABLE_MKLDNN_V1 +#endif // !ENABLE_MKLDNN_V1 src_mem(nullptr), weight_mem(nullptr), bias_mem(nullptr), diff --git a/tensorflow/core/kernels/mkl_qmatmul_op.cc b/tensorflow/core/kernels/mkl_qmatmul_op.cc index 311eeeb5221..743bf641298 100644 --- a/tensorflow/core/kernels/mkl_qmatmul_op.cc +++ b/tensorflow/core/kernels/mkl_qmatmul_op.cc @@ -243,11 +243,7 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase { // Check if src and weight data need to be reordered. 
Tinput* src_data = nullptr; -#ifdef ENABLE_MKLDNN_V1 if (IS_SRC_REORDER_NEEDED(src_md, matmul_fwd_pd, matmul_fwd)) { -#else - if (src_md.data.format != matmul_fwd->GetSrcMemoryFormat()) { -#endif src.SetUsrMem(src_md, &src_tensor); src.CheckReorderToOpMem(MEMORY_PD_WITHOUT_DATA( matmul_fwd_pd.get()->PRIMITIVE_DESC_SRC, this->cpu_engine_)); @@ -258,11 +254,7 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase { } Tweight* weight_data = nullptr; -#ifdef ENABLE_MKLDNN_V1 if (IS_WEIGHTS_REORDER_NEEDED(weight_md, matmul_fwd_pd, matmul_fwd)) { -#else - if (weight_md.data.format != matmul_fwd->GetweightMemoryFormat()) { -#endif bool is_weight_cached = false; // For batch size 1, MKL-DNN expects that weight format is OI whereas // TF default format is IO. So in that case convert weight from IO @@ -280,7 +272,7 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase { context, static_cast(weight_mkl_shape.GetTfDataFormat())); #else weight_data = GetCachedWeight( - context, static_cast(matmul_fwd->GetweightMemoryFormat())); + context, static_cast(matmul_fwd->GetWeightMemoryFormat())); #endif is_weight_cached = (weight_data != nullptr); } @@ -554,14 +546,10 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase { OP_REQUIRES_OK(context, context->allocate_persistent( DT_INT32, weight_mkl_format, &weight_oi_md, &weight_md_tensor_ptr)); -#ifdef ENABLE_MKLDNN_V1 - // Using the logic from filter caching in mkl_conv_ops.cc weight_md_tensor_ptr->scalar()() = - static_cast(weight_mkl_shape.GetTfDataFormat()); -#else - weight_md_tensor_ptr->scalar()() = - matmul_fwd_pd.get()->weights_primitive_desc().desc().data.format; -#endif // ENABLE_MKLDNN_V1 + static_cast(GET_TF_DATA_FORMAT( + weight_mkl_shape, + matmul_fwd_pd.get()->weights_primitive_desc().desc())); } Tweight* GetCachedWeight(OpKernelContext* context, int32 weight_mf) diff --git a/tensorflow/core/util/mkl_types.h b/tensorflow/core/util/mkl_types.h index 8e7c8e4e819..17df80d7000 100644 --- a/tensorflow/core/util/mkl_types.h +++ b/tensorflow/core/util/mkl_types.h @@ -149,7 +149,7 @@ namespace tensorflow { #define IS_SRC_REORDER_NEEDED(src_md, op_pd, op) \ src_md.data.format != op->GetSrcMemoryFormat() #define IS_WEIGHTS_REORDER_NEEDED(weights_md, op_pd, op) \ - weights_md.data.format != op->GetWeightsMemoryFormat() + weights_md.data.format != op->GetWeightMemoryFormat() #define GET_MEMORY_DESC_FROM_MEM_PTR(mem_ptr) \ mem_ptr->get_primitive_desc().desc() #define GET_MEMORY_PRIMITIVE_DESC_FROM_MEM_PTR(mem_ptr) \ From b7a908927e6a7c3d56fd6940277fcf3e809c60b6 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Sat, 15 Feb 2020 23:31:48 +0100 Subject: [PATCH 087/442] Handle return status and guard new TRT API usage --- .../tf2tensorrt/kernels/trt_engine_op.cc | 1 - .../utils/trt_shape_optimization_profiles.cc | 16 +++++++++++----- .../utils/trt_shape_optimization_profiles.h | 8 ++++---- .../trt_shape_optimization_profiles_test.cc | 8 +++++--- .../compiler/tensorrt/test/trt_mode_test.py | 1 + 5 files changed, 21 insertions(+), 13 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index 10f31b77096..f9e080da550 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -699,7 +699,6 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx, << ", but only 1 context is present."; return kRetry; } - auto& execution_context = engine_context->execution_context; const int 
num_binding = cuda_engine->getNbBindings(); std::vector buffers(num_binding); diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc index 60ceac2077d..11ccc3e0c12 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc @@ -68,9 +68,9 @@ Status TrtShapeOptimizationProfile::AddProfiles( VLOG(1) << "Added optimization profile " << profiles_[i].DebugString() << " to builder config."; } else { - VLOG(ERROR) << "Failed to add optimization profile " - << profiles_[i].DebugString() - << ". This usually happens when profile is invalid."; + LOG(ERROR) << "Failed to add optimization profile " + << profiles_[i].DebugString() + << ". This usually happens when profile is invalid."; } } if (config->getNbOptimizationProfiles() == 0) { @@ -85,7 +85,7 @@ Status TrtShapeOptimizationProfile::AddProfiles( Status TrtShapeOptimizationProfile::ConfigureBuilder( nvinfer1::IBuilder* builder, nvinfer1::IBuilderConfig* config, const nvinfer1::INetworkDefinition* network) { - AddProfiles(builder, config, network); + TF_RETURN_IF_ERROR(AddProfiles(builder, config, network)); return Status::OK(); } #endif @@ -140,10 +140,16 @@ Status TrtShapeOptimizationProfile::CreateExecutionContexts( Status TrtShapeOptimizationProfile::RestoreProfiles( const nvinfer1::ICudaEngine* engine) { #if IS_TRT_VERSION_GE(6, 0, 0, 0) - if (!engine || engine->hasImplicitBatchDimension()) { + if (!engine) { + // We do not need to restore profiles for an empty engine + return Status::OK(); + } +#if IS_TRT_VERSION_GE(7, 0, 0, 0) + if (engine->hasImplicitBatchDimension()) { // Nothing to do, we cannot have profiles in implicit batch mode return Status::OK(); } +#endif int n_profiles = engine->getNbOptimizationProfiles(); int n_inputs = GetNumberOfEngineInputs(engine); VLOG(2) << "Attempting to restore " << n_profiles << " profiles, each with " diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h index 281692c8b08..fd321898f17 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_SHAPE_OPTIMIZATION_PROFILES_H_ -#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_SHAPE_OPTIMIZATION_PROFILES_H_ +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_SHAPE_OPTIMIZATION_PROFILES_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_SHAPE_OPTIMIZATION_PROFILES_H_ #include #include @@ -117,7 +117,7 @@ struct OptimizationProfileConfig { // before the engine is created. 
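// Illustrative usage sketch (not part of this header; error handling omitted),
// based on the calls visible elsewhere in this patch series:
//   TrtShapeOptimizationProfile profiles;
//   profiles.AddShape(input_concrete_shapes);  // collected in build mode
//   profiles.InitProfiles();                   // map shapes to min/opt/max
//   TF_RETURN_IF_ERROR(profiles.ConfigureBuilder(builder, config, network));
//   // ... build the cuda engine ...
//   TF_RETURN_IF_ERROR(profiles.CreateExecutionContexts(engine, contexts));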
class TrtShapeOptimizationProfile { public: - TrtShapeOptimizationProfile(){}; + TrtShapeOptimizationProfile(){} // Stores input shape information during profile_generation_mode void AddShape(std::vector shapes) { @@ -175,4 +175,4 @@ class TrtShapeOptimizationProfile { #endif // GOOGLE_TENSORRT #endif // GOOGLE_CUDA -#endif // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_SHAPE_OPTIMIZATION_PROFILES_H_ +#endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_SHAPE_OPTIMIZATION_PROFILES_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc index 8efd65cdce5..ffc4156e8dd 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles_test.cc @@ -37,7 +37,8 @@ std::vector DimVecToShapeVec(std::vector dimvec) { std::vector shapevec(dimvec.size()); for (int i = 0; i < dimvec.size(); i++) { TensorShape shape; - TensorShapeUtils::MakeShape(dimvec[i].d, dimvec[i].nbDims, &shape); + TF_CHECK_OK(TensorShapeUtils::MakeShape(dimvec[i].d, dimvec[i].nbDims, + &shape)); shapevec[i] = shape; } return shapevec; @@ -116,10 +117,11 @@ class TrtShapeOptimizationProfileTest : public ::testing::Test { std::vector> exec_context_; // The order is important: exec_context_ must be destroyed first, and logger // at last. - +#if IS_TRT_VERSION_GE(6, 0, 0, 0) const uint32_t flags_ = 1U << static_cast( nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); +#endif }; TEST_F(TrtShapeOptimizationProfileTest, Static) { @@ -141,7 +143,7 @@ TEST_F(TrtShapeOptimizationProfileTest, Static) { builder_->buildCudaEngine(*network_)); #endif EXPECT_NE(nullptr, engine); - profile.CreateExecutionContexts(engine.get(), exec_context_); + TF_CHECK_OK(profile.CreateExecutionContexts(engine.get(), exec_context_)); // A single execution context should be created for a graph with static input ASSERT_EQ(exec_context_.size(), 1); EXPECT_NE(nullptr, exec_context_[0]); diff --git a/tensorflow/python/compiler/tensorrt/test/trt_mode_test.py b/tensorflow/python/compiler/tensorrt/test/trt_mode_test.py index 415c16a114d..9a823ab56d4 100644 --- a/tensorflow/python/compiler/tensorrt/test/trt_mode_test.py +++ b/tensorflow/python/compiler/tensorrt/test/trt_mode_test.py @@ -27,6 +27,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import test + class TrtModeTestBase(trt_test.TfTrtIntegrationTestBase): """Test squeeze on batch dim and some unary operations in TF-TRT.""" From be940c6d059557e8757391a8d73554d54796139d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 16 Feb 2020 08:46:45 -0800 Subject: [PATCH 088/442] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 295444320 Change-Id: Ia931402f0e9d7a005e710862d3e631bba83add36 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 86be1ef98aa..ffa9931d561 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21329,7 +21329,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22037,7 +22037,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22233,7 +22233,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22302,7 +22302,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22417,7 +22417,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22476,7 +22476,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22650,7 +22650,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22841,7 +22841,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25281,7 +25281,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25613,7 +25613,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25663,7 +25663,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25913,7 +25913,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26543,7 +26543,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27608,7 +27608,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45467,7 +45467,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 5589d47843656fad7a84d9ed156006e60ffab649 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 16 Feb 2020 12:46:27 -0800 Subject: [PATCH 089/442] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 295462068 Change-Id: I52f9eaf8900317fe643419c811c0d9c489486d26 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index ffa9931d561..86be1ef98aa 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. 
-// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21329,7 +21329,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22037,7 +22037,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22233,7 +22233,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22302,7 +22302,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22417,7 +22417,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22476,7 +22476,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22650,7 +22650,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22841,7 +22841,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25281,7 +25281,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25613,7 +25613,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25663,7 +25663,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25913,7 +25913,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26543,7 +26543,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27608,7 +27608,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45467,7 +45467,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 3383a6454578dc3cb3b8d7484d7b020abcd4e882 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Sun, 16 Feb 2020 22:43:03 +0100 Subject: [PATCH 090/442] Remove unnecessary move --- .../tf2tensorrt/utils/trt_shape_optimization_profiles.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc index 11ccc3e0c12..6f19b8ead1c 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc @@ -130,7 +130,7 @@ Status TrtShapeOptimizationProfile::CreateExecutionContexts( #endif } exec_context.push_back( - std::move(TrtUniquePtrType(ctx))); + TrtUniquePtrType(ctx)); i++; } while (i < profiles_.size()); From d6e8d078c9a9cd3d05f8d008673db6878d76a812 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 16 Feb 2020 14:46:23 -0800 Subject: [PATCH 091/442] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 295471666 Change-Id: Ib971e46e2b7958734af536447ea1fad2548d2092 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 86be1ef98aa..ffa9931d561 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21329,7 +21329,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22037,7 +22037,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22233,7 +22233,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22302,7 +22302,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22417,7 +22417,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22476,7 +22476,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22650,7 +22650,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22841,7 +22841,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25281,7 +25281,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25613,7 +25613,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25663,7 +25663,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25913,7 +25913,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26543,7 +26543,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27608,7 +27608,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45467,7 +45467,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 04903ab265d0c066179d2a5a1caff42c384f8007 Mon Sep 17 00:00:00 2001 From: Juho Ha Date: Sun, 16 Feb 2020 15:01:11 -0800 Subject: [PATCH 092/442] Add missing exported files required to build tensorflow-lite(-gpu).aar PiperOrigin-RevId: 295472743 Change-Id: Idb7219338ee087f2544a8af821c138e738913370 --- tensorflow/lite/c/BUILD | 1 + tensorflow/lite/delegates/gpu/BUILD | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/c/BUILD b/tensorflow/lite/c/BUILD index b5b15c51932..f9549fc3571 100644 --- a/tensorflow/lite/c/BUILD +++ b/tensorflow/lite/c/BUILD @@ -128,6 +128,7 @@ cc_library( # For use with library targets that can't use relative paths. exports_files([ "c_api.h", + "c_api_experimental.h", "common.h", ]) diff --git a/tensorflow/lite/delegates/gpu/BUILD b/tensorflow/lite/delegates/gpu/BUILD index 327a1a8677c..ba2a05b09ec 100644 --- a/tensorflow/lite/delegates/gpu/BUILD +++ b/tensorflow/lite/delegates/gpu/BUILD @@ -8,7 +8,7 @@ package( ) exports_files([ - "gpu_delegate.h", + "delegate.h", "metal_delegate.h", ]) From a00bd4687adac4d5f1880595262276e656375322 Mon Sep 17 00:00:00 2001 From: Yi Situ Date: Sun, 16 Feb 2020 15:18:17 -0800 Subject: [PATCH 093/442] Fix build broken by signed/unsigned comparisons. PiperOrigin-RevId: 295474359 Change-Id: I03d7f9653db2122be76d953bf93f19ec00e8d856 --- tensorflow/core/profiler/internal/cpu/host_tracer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer.cc b/tensorflow/core/profiler/internal/cpu/host_tracer.cc index 479ca8b448f..4d54093a1e2 100644 --- a/tensorflow/core/profiler/internal/cpu/host_tracer.cc +++ b/tensorflow/core/profiler/internal/cpu/host_tracer.cc @@ -110,7 +110,7 @@ Status HostTracer::CollectData(RunMetadata* run_metadata) { constexpr char kUserMetadataMarker = '#'; for (TraceMeRecorder::ThreadEvents& thread : events_) { - int32 thread_id = thread.thread.tid; + uint32_t thread_id = thread.thread.tid; thread_names->insert({thread_id, thread.thread.name}); for (TraceMeRecorder::Event& event : thread.events) { if (event.start_time && event.end_time) { From fd05051846fd9ceb090206600afd1a71ba852e20 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 16 Feb 2020 18:45:54 -0800 Subject: [PATCH 094/442] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 295491725 Change-Id: I6e9bf90f14d39bfde27b52d3489f661ac436a89c --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index ffa9931d561..86be1ef98aa 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21329,7 +21329,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22037,7 +22037,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22233,7 +22233,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22302,7 +22302,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22417,7 +22417,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22476,7 +22476,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22650,7 +22650,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22841,7 +22841,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25281,7 +25281,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25613,7 +25613,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25663,7 +25663,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25913,7 +25913,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26543,7 +26543,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27608,7 +27608,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45467,7 +45467,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 074d852ad3005519e1d45211c458406a27907ca4 Mon Sep 17 00:00:00 2001 From: TengLu Date: Mon, 17 Feb 2020 11:42:27 +0800 Subject: [PATCH 095/442] Refine weight cache code according to review. --- .../core/kernels/mkl_matmul_ops_common.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/kernels/mkl_matmul_ops_common.h b/tensorflow/core/kernels/mkl_matmul_ops_common.h index f80579b8bef..067b98e8f76 100644 --- a/tensorflow/core/kernels/mkl_matmul_ops_common.h +++ b/tensorflow/core/kernels/mkl_matmul_ops_common.h @@ -379,7 +379,7 @@ class MklDnnMatMulOpBase : public OpKernel { // inside the function. inline bool IsWeightCacheEmpty(OpKernelContext* context) LOCKS_EXCLUDED(mu_) { tf_shared_lock lock(mu_); - return (weight_oi.NumElements() == 0); + return (weight_oi_.NumElements() == 0); } // Cache the converted weight in a persistent tensor. @@ -392,9 +392,9 @@ class MklDnnMatMulOpBase : public OpKernel { MklDnnData& weight, const memory::desc& weight_md) LOCKS_EXCLUDED(mu_) { mutex_lock lock(mu_); - const Tensor& weight_t = *weight_oi.AccessTensor(context); + const Tensor& weight_t = *weight_oi_.AccessTensor(context); - // if the weights are already cahced, there's nothing to do + // If the weights are already cached, there's nothing to do if (weight_t.NumElements() > 0) { return; } @@ -413,7 +413,7 @@ class MklDnnMatMulOpBase : public OpKernel { OP_REQUIRES_OK(context, context->allocate_persistent( DataTypeToEnum::value, weight_tf_shape, - &weight_oi, &weight_tensor_ptr)); + &weight_oi_, &weight_tensor_ptr)); void* weight_oi_t_data = weight.GetTensorBuffer(weight_tensor_ptr); size_t weight_size = weight.GetOpMem().get_primitive_desc().get_size(); @@ -425,7 +425,7 @@ class MklDnnMatMulOpBase : public OpKernel { weight_mkl_format.AddDim(1); OP_REQUIRES_OK(context, context->allocate_persistent( - DT_INT32, weight_mkl_format, &weight_oi_md, + DT_INT32, weight_mkl_format, &weight_oi_md_, &weight_md_tensor_ptr)); weight_md_tensor_ptr->scalar()() = matmul_fwd_pd.get()->weights_primitive_desc().desc().data.format; @@ -435,8 +435,8 @@ class MklDnnMatMulOpBase : public OpKernel { const memory::format& weight_mf) LOCKS_EXCLUDED(mu_) { tf_shared_lock lock(mu_); - const Tensor& weight_t = *weight_oi.AccessTensor(context); - const Tensor& weight_md_t = *weight_oi_md.AccessTensor(context); + const Tensor& weight_t = *weight_oi_.AccessTensor(context); + const Tensor& weight_md_t = *weight_oi_md_.AccessTensor(context); // Check if the memory descriptor of the cached weight is same as // weight_mf. 
if so use the cached memory, else return NULL @@ -453,8 +453,8 @@ class MklDnnMatMulOpBase : public OpKernel { protected: // Tensor to save reordered weight mutex mu_; - PersistentTensor weight_oi GUARDED_BY(mu_); - PersistentTensor weight_oi_md GUARDED_BY(mu_); + PersistentTensor weight_oi_ GUARDED_BY(mu_); + PersistentTensor weight_oi_md_ GUARDED_BY(mu_); bool is_weight_const_; From a4ecf3dc000b1b4886604628a7491370e13e80fb Mon Sep 17 00:00:00 2001 From: Thai Nguyen Date: Sun, 16 Feb 2020 22:19:32 -0800 Subject: [PATCH 096/442] Automatic NEON detection for ARM native build PiperOrigin-RevId: 295513820 Change-Id: I89806905fe274577f5595d8a4a17139d27505cbc --- tensorflow/lite/tools/make/Makefile | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tensorflow/lite/tools/make/Makefile b/tensorflow/lite/tools/make/Makefile index 5bbb7f6a034..c3280f0e62c 100644 --- a/tensorflow/lite/tools/make/Makefile +++ b/tensorflow/lite/tools/make/Makefile @@ -68,6 +68,21 @@ ifeq ($(HOST_OS),windows) CXXFLAGS += -fext-numeric-literals -D__LITTLE_ENDIAN__ endif +# Auto-detect optimization opportunity if building natively. +ifeq ($(HOST_OS),$(TARGET)) +ifeq ($(HOST_ARCH),$(TARGET_ARCH)) +ifeq ($(TARGET_ARCH),armv7l) +ifneq ($(shell cat /proc/cpuinfo | grep Features | grep neon),) + ifneq ($(shell cat /proc/cpuinfo | grep Features | grep vfpv4),) + CXXFLAGS += -mfpu=neon-vfpv4 + else + CXXFLAGS += -mfpu=neon + endif +endif # ifeq ($(TARGET_ARCH),armv7l) +endif # ifeq ($(HOST_ARCH),$(TARGET_ARCH)) +endif # ifeq ($(HOST_OS),$(TARGET)) +endif + # This library is the main target for this makefile. It will contain a minimal # runtime that can be linked in to other programs. LIB_NAME := libtensorflow-lite.a From 3c11fed56d03ff5a6eaba24f0550c43dedd68741 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 16 Feb 2020 22:56:14 -0800 Subject: [PATCH 097/442] Conversion rule for MatrixSetDiag, MatrixSetDiagV2, and MatrixSetDiagV3 PiperOrigin-RevId: 295516998 Change-Id: Ia8e26fee7edb8f199dfdc9be0970fbf94e90ee7e --- tensorflow/compiler/mlir/lite/ir/tfl_ops.td | 23 ++++ .../compiler/mlir/lite/tests/legalize-tf.mlir | 9 ++ .../compiler/mlir/lite/tests/prepare-tf.mlir | 31 +++++ .../mlir/lite/transforms/legalize_patterns.td | 4 + .../mlir/lite/transforms/prepare_patterns.td | 13 ++ .../mlir/tensorflow/ir/tf_generated_ops.td | 124 ++++++++++++++++++ 6 files changed, 204 insertions(+) diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 5b247a43442..9444aab6ce8 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -891,6 +891,29 @@ def TFL_MatrixDiagOp : TFL_Op<"matrix_diag", [ let hasOptions = 0; } +def TFL_MatrixSetDiagOp : TFL_Op<"matrix_set_diag", [NoSideEffect]> { + let summary = [{ + Returns a batched matrix tensor with new batched diagonal values. + }]; + + let description = [{ +Given `input` and `diagonal`, this operation returns a tensor with the +same shape and values as `input`, except for the main diagonal of the +innermost matrices. These will be overwritten by the values in `diagonal`. 
+ }]; + + let arguments = (ins + TensorOf<[F32, I32, I64, I8, QI8, QI16, QUI8, TFL_Uint8, TFL_Quint8]>:$input, + TensorOf<[F32, I32, I64, I8, QI8, QI16, QUI8, TFL_Uint8, TFL_Quint8]>:$diagonal + ); + + let results = (outs + TensorOf<[F32, I32, I64, I8, QI8, QI16, QUI8, TFL_Uint8, TFL_Quint8]>:$output + ); + + let hasOptions = 0; +} + // These ops are named NonMaxSuppressionV4 & NonMaxSuppressionV5 to be // consistent with TensorFlow's naming. They are NOT 'versions' of NMS in the // sense that one is an incremental change over the other. diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir index e44128d587f..570e909e256 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir @@ -739,6 +739,15 @@ func @matrix_diag_v3(%arg0: tensor<8x16xf32>) -> tensor<8x16x16xf32> { // CHECK: return [[VAL_6]] : tensor<8x16x16xf32> } +func @matrix_set_diag(%arg0: tensor<3x3xi32>, %arg1: tensor<3xi32>) -> tensor<3x3xi32> { + %0 = "tf.MatrixSetDiag"(%arg0, %arg1) : (tensor<3x3xi32>, tensor<3xi32>) -> tensor<3x3xi32> + return %0 : tensor<3x3xi32> + +// CHECK-LABEL: func @matrix_set_diag( +// CHECK: [[VAL_0:%.*]] = "tfl.matrix_set_diag"(%arg0, %arg1) : (tensor<3x3xi32>, tensor<3xi32>) -> tensor<3x3xi32> +// CHECK: return [[VAL_0]] +} + func @maximum(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) -> tensor<8x16xf32> { %0 = "tf.Maximum"(%arg0, %arg1) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xf32> return %0 : tensor<8x16xf32> diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir index 6c635bd3500..1aa1311318a 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir @@ -511,3 +511,34 @@ func @PadStridedSliceNewAxisMask2(%arg0: tensor<4x64x64x1xf32>) -> tensor<1x4x64 %1 = "tf.StridedSlice"(%0, %cst, %cst, %cst_0) {Index = i32, T = f32, _output_shapes = ["tfshape$dim { size: 1 } dim { size: 4 } dim { size: 64 } dim { size: 64 }"], begin_mask = 6 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 6 : i64, new_axis_mask = 1 : i64, shrink_axis_mask = 0 : i64} : (tensor<4x64x64xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x4x64x64xf32> return %1 : tensor<1x4x64x64xf32> } + +// CHECK-LABEL: @MatrixSetDiagV2Conversion +func @MatrixSetDiagV2Conversion(%arg0: tensor<3x3xi32>, %arg1: tensor<3xi32>) -> tensor<3x3xi32> { + %cst = constant dense<0> : tensor + %0 = "tf.MatrixSetDiagV2"(%arg0, %arg1, %cst) : (tensor<3x3xi32>, tensor<3xi32>, tensor) -> tensor<3x3xi32> + return %0 : tensor<3x3xi32> + + // CHECK: %[[RES:.*]] = "tf.MatrixSetDiag"(%arg0, %arg1) : (tensor<3x3xi32>, tensor<3xi32>) -> tensor<3x3xi32> + // CHECK: return %[[RES]] +} + +// CHECK-LABEL: @MatrixSetDiagV2NonZeroK +func @MatrixSetDiagV2NonZeroK(%arg0: tensor<3x3xi32>, %arg1: tensor<3xi32>) -> tensor<3x3xi32> { + %cst = constant dense<1> : tensor + %0 = "tf.MatrixSetDiagV2"(%arg0, %arg1, %cst) : (tensor<3x3xi32>, tensor<3xi32>, tensor) -> tensor<3x3xi32> + return %0 : tensor<3x3xi32> + + // CHECK: %[[CST:.*]] = constant dense<1> : tensor + // CHECK: %[[RES:.*]] = "tf.MatrixSetDiagV2"(%arg0, %arg1, %[[CST]]) : (tensor<3x3xi32>, tensor<3xi32>, tensor) -> tensor<3x3xi32> + // CHECK: return %[[RES]] +} + +// CHECK-LABEL: @MatrixSetDiagV3Conversion +func @MatrixSetDiagV3Conversion(%arg0: tensor<3x3xi32>, %arg1: tensor<3xi32>) -> tensor<3x3xi32> { + %cst = constant 
dense<0> : tensor + %0 = "tf.MatrixSetDiagV3"(%arg0, %arg1, %cst) : (tensor<3x3xi32>, tensor<3xi32>, tensor) -> tensor<3x3xi32> + return %0 : tensor<3x3xi32> + + // CHECK: %[[RES:.*]] = "tf.MatrixSetDiag"(%arg0, %arg1) : (tensor<3x3xi32>, tensor<3xi32>) -> tensor<3x3xi32> + // CHECK: return %[[RES]] +} diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td index d638a5f1a60..7bc08ee1c76 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td @@ -365,3 +365,7 @@ def : Pat< /*padding=*/ $padding, /*stride_h=*/ ExtractI32At<1>:$strides, /*stride_w=*/ ExtractI32At<2>:$strides)>; + +def : Pat< + (TF_MatrixSetDiagOp $input, $diagonal), + (TFL_MatrixSetDiagOp $input, $diagonal)>; diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td b/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td index 7db615327e7..aed99a70bff 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td @@ -190,3 +190,16 @@ def : Pat<(TF_ReshapeOp:$old_value // parameters of the input, so we can remove the quantization ops. def : Pat<(TF_RankOp (TFL_DequantizeOp (TFL_QuantizeOp $input, $qtype))), (TF_RankOp $input)>; + +// `k` is expected to be 0, other values are not supported currently. +def : Pat<(TF_MatrixSetDiagV2Op $input, $diagonal, + (ConstantOp ConstantAttr)), + (TF_MatrixSetDiagOp $input, $diagonal)>; + +// `align` attribute can be ignored because we only support converting +// `MatrixSetDiagV3` to `MatrixSetDiag` with default `k` inputs. +def : Pat<(TF_MatrixSetDiagV3Op $input, $diagonal, + (ConstantOp ConstantAttr), + $align), + (TF_MatrixSetDiagOp $input, $diagonal)>; + diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 9b9a727d66e..ad00ab222a4 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -3392,6 +3392,130 @@ tf.matrix_diag(diagonal, k = -1, num_rows = 3, padding_value = 9) TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_MatrixSetDiagOp : TF_Op<"MatrixSetDiag", [NoSideEffect]> { + let summary = [{ +Returns a batched matrix tensor with new batched diagonal values. + }]; + + let description = [{ +Given `input` and `diagonal`, this operation returns a tensor with the +same shape and values as `input`, except for the main diagonal of the +innermost matrices. These will be overwritten by the values in `diagonal`. + +The output is computed as follows: + +Assume `input` has `k+1` dimensions `[I, J, K, ..., M, N]` and `diagonal` has +`k` dimensions `[I, J, K, ..., min(M, N)]`. Then the output is a +tensor of rank `k+1` with dimensions `[I, J, K, ..., M, N]` where: + + * `output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n]` for `m == n`. + * `output[i, j, k, ..., m, n] = input[i, j, k, ..., m, n]` for `m != n`. + }]; + + let arguments = (ins + TF_Tensor:$input, + TF_Tensor:$diagonal + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_MatrixSetDiagV2Op : TF_Op<"MatrixSetDiagV2", [NoSideEffect]> { + let summary = [{ +Returns a batched matrix tensor with new batched diagonal values. 
+ }]; + + let description = [{ +Given `input` and `diagonal`, this operation returns a tensor with the +same shape and values as `input`, except for the specified diagonals of the +innermost matrices. These will be overwritten by the values in `diagonal`. + +`input` has `r+1` dimensions `[I, J, ..., L, M, N]`. When `k` is scalar or +`k[0] == k[1]`, `diagonal` has `r` dimensions `[I, J, ..., L, max_diag_len]`. +Otherwise, it has `r+1` dimensions `[I, J, ..., L, num_diags, max_diag_len]`. +`num_diags` is the number of diagonals, `num_diags = k[1] - k[0] + 1`. +`max_diag_len` is the longest diagonal in the range `[k[0], k[1]]`, +`max_diag_len = min(M + min(k[1], 0), N + min(-k[0], 0))` + +The output is a tensor of rank `k+1` with dimensions `[I, J, ..., L, M, N]`. +If `k` is scalar or `k[0] == k[1]`: + +``` +output[i, j, ..., l, m, n] + = diagonal[i, j, ..., l, n-max(k[1], 0)] ; if n - m == k[1] + input[i, j, ..., l, m, n] ; otherwise +``` + +Otherwise, + +``` +output[i, j, ..., l, m, n] + = diagonal[i, j, ..., l, diag_index, index_in_diag] ; if k[0] <= d <= k[1] + input[i, j, ..., l, m, n] ; otherwise +``` +where `d = n - m`, `diag_index = k[1] - d`, and `index_in_diag = n - max(d, 0)`. + +For example: + +``` +# The main diagonal. +input = np.array([[[7, 7, 7, 7], # Input shape: (2, 3, 4) + [7, 7, 7, 7], + [7, 7, 7, 7]], + [[7, 7, 7, 7], + [7, 7, 7, 7], + [7, 7, 7, 7]]]) +diagonal = np.array([[1, 2, 3], # Diagonal shape: (2, 3) + [4, 5, 6]]) +tf.matrix_set_diag(diagonal) ==> [[[1, 7, 7, 7], # Output shape: (2, 3, 4) + [7, 2, 7, 7], + [7, 7, 3, 7]], + [[4, 7, 7, 7], + [7, 5, 7, 7], + [7, 7, 6, 7]]] + +# A superdiagonal (per batch). +tf.matrix_set_diag(diagonal, k = 1) + ==> [[[7, 1, 7, 7], # Output shape: (2, 3, 4) + [7, 7, 2, 7], + [7, 7, 7, 3]], + [[7, 4, 7, 7], + [7, 7, 5, 7], + [7, 7, 7, 6]]] + +# A band of diagonals. +diagonals = np.array([[[1, 2, 3], # Diagonal shape: (2, 2, 3) + [4, 5, 0]], + [[6, 1, 2], + [3, 4, 0]]]) +tf.matrix_set_diag(diagonals, k = (-1, 0)) + ==> [[[1, 7, 7, 7], # Output shape: (2, 3, 4) + [4, 2, 7, 7], + [0, 5, 3, 7]], + [[6, 7, 7, 7], + [3, 1, 7, 7], + [7, 4, 2, 7]]] + +``` + }]; + + let arguments = (ins + TF_Tensor:$input, + TF_Tensor:$diagonal, + I32Tensor:$k + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_MatrixSetDiagV3Op : TF_Op<"MatrixSetDiagV3", [NoSideEffect]> { let summary = [{ Returns a batched matrix tensor with new batched diagonal values. From 99c28c59f151c62681b305e60071aaea1bfffd11 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 17 Feb 2020 00:46:21 -0800 Subject: [PATCH 098/442] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 295529175 Change-Id: Ia2e17a0366372ff96774d881b289338a176b04fe --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 86be1ef98aa..ffa9931d561 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21329,7 +21329,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22037,7 +22037,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22233,7 +22233,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22302,7 +22302,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22417,7 +22417,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22476,7 +22476,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22650,7 +22650,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22841,7 +22841,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25281,7 +25281,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25613,7 +25613,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25663,7 +25663,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25913,7 +25913,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26543,7 +26543,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27608,7 +27608,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45467,7 +45467,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1}
+// If not specified, defaults to {i:1 i:1 i:1 i:1}
 func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr {
	return func(m optionalAttr) {
		m["dilations"] = value

From 2c5e22190c7aab844be380a91a126ba23854ad34 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Mon, 17 Feb 2020 01:02:36 -0800
Subject: [PATCH 099/442] compat: Update forward compatibility horizon to
 2020-02-17

PiperOrigin-RevId: 295531152
Change-Id: I1397d032d97060d6d174054fb74139f530a16d9a
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 76053c9e431..1dae10ae638 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -31,7 +31,7 @@ from tensorflow.python.util.tf_export import tf_export
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 2, 16)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 2, 17)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None

From e4c9dedb31df127aa6f52050f70f0084fd3e4c93 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Mon, 17 Feb 2020 04:32:04 -0800
Subject: [PATCH 100/442] Fix HLO cost analysis for rng-bit-generator

PiperOrigin-RevId: 295560878
Change-Id: Ib19f4a5a714853ce5b755321a9e6063b31acf573
---
 tensorflow/compiler/xla/service/hlo_cost_analysis.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index ef3809c1b94..2e089f34bac 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -751,7 +751,7 @@ Status HloCostAnalysis::HandleRngBitGenerator(const HloInstruction* random) {
   // cost changes with the implementation and the distribution. For now, assume
   // the cost of each RNG is same as a transcendental operation.
   current_properties_[kTranscendentalsKey] =
-      ShapeUtil::ElementsIn(random->shape());
+      ShapeUtil::ElementsInRecursive(random->shape());
   return Status::OK();
 }

From 4ab52e3bc007bc64488171407a7147123559ca94 Mon Sep 17 00:00:00 2001
From: Adrian Kuegel
Date: Mon, 17 Feb 2020 05:14:30 -0800
Subject: [PATCH 101/442] Add dialect registration dependency to MlirCompiler.

Due to a recent change, this dependency is now needed to register dialects.
This fixes the mlir_gpu_lhlo_gen_test.
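For readers outside the MLIR codebase, the mechanism behind this one-line dependency change: at this point MLIR dialects were registered through global constructors, so a dialect is only visible to an MLIRContext if the object file that performs the registration is actually linked in. A rough sketch of that idiom follows; the header path and dialect class name are assumptions for illustration, not copied from the actual tensorflow_dialect_registration target.

// Sketch only: linking in a translation unit like this is what registers the
// dialect; without the BUILD dependency the constructor below never runs.
#include "mlir/IR/Dialect.h"
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"  // assumed header

// Constructing this global at load time adds the TensorFlow dialect to MLIR's
// global registry, making it available to every MLIRContext in the process.
static mlir::DialectRegistration<mlir::TF::TensorFlowDialect> tf_dialect_registration;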
PiperOrigin-RevId: 295566045 Change-Id: I5f8476c8e1a11e324223cb6be025918826135266 --- tensorflow/compiler/xla/service/mlir_gpu/BUILD | 1 + tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD | 5 +---- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/xla/service/mlir_gpu/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/BUILD index 36e20656974..1eab89da887 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/BUILD +++ b/tensorflow/compiler/xla/service/mlir_gpu/BUILD @@ -73,6 +73,7 @@ cc_library( "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", "@llvm-project//mlir:TargetNVVMIR", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:buffer_assignment", "//tensorflow/compiler/xla/service:compiler", diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD index 84f1c7668e5..05429224f6a 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD @@ -25,10 +25,7 @@ package_group( tf_cc_test( name = "mlir_gpu_lhlo_gen_test", srcs = if_cuda_is_configured(["mlir_gpu_lhlo_gen_test.cc"]), - tags = tf_cuda_tests_tags() + [ - "no_rocm", - "no_oss", # TODO(b/149544192): Fix the test. - ], + tags = tf_cuda_tests_tags() + ["no_rocm"], deps = [ "//tensorflow/core:test_main", "//tensorflow/core:test", From a4e0fca9c4a0e95cabcc7bf7bdf29df6bbb680a0 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Mon, 17 Feb 2020 06:16:32 -0800 Subject: [PATCH 102/442] [tf:mlir] Drop references to AllPassesAndDialects PiperOrigin-RevId: 295574916 Change-Id: Ifa20291ccc73e3d352c3146d7f78f7f1fa6d02c7 --- tensorflow/compiler/mlir/lite/BUILD | 3 --- tensorflow/compiler/mlir/tensorflow/BUILD | 2 -- tensorflow/compiler/mlir/xla/BUILD | 2 -- tensorflow/compiler/xla/service/mlir_gpu/BUILD | 1 - 4 files changed, 8 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index 7f5da2ad3de..ce091dabd9e 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -582,7 +582,6 @@ cc_library( "@com_google_absl//absl/strings", "@flatbuffers", "@llvm-project//llvm:support", - "@llvm-project//mlir:AllPassesAndDialects", "@llvm-project//mlir:IR", "@llvm-project//mlir:QuantOps", "@llvm-project//mlir:StandardOps", @@ -694,7 +693,6 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes", "//tensorflow/compiler/mlir/tensorflow:tf_graph_optimization_pass", "//tensorflow/compiler/mlir/tensorflow:translate_lib", - "@llvm-project//mlir:AllPassesAndDialects", "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", @@ -727,7 +725,6 @@ cc_library( "//tensorflow/lite/tools/optimize:quantize_weights", "//tensorflow/stream_executor/lib", "@llvm-project//llvm:support", - "@llvm-project//mlir:AllPassesAndDialects", "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", "@llvm-project//mlir:Parser", diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 0058e949969..f6a37c4a5f2 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -708,7 +708,6 @@ cc_library( deps = [ ":tensorflow_dialect_registration", ":tf_dialect_passes", - "@llvm-project//mlir:AllPassesAndDialects", ], ) @@ -913,7 +912,6 @@ cc_library( "//tensorflow/core/platform:logging", 
"//tensorflow/stream_executor/lib", "@llvm-project//llvm:support", - "@llvm-project//mlir:AllPassesAndDialects", "@llvm-project//mlir:IR", "@llvm-project//mlir:Parser", "@llvm-project//mlir:Pass", diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index 8a2b18cd906..a4115479a0b 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -157,7 +157,6 @@ cc_library( ":lhlo", "@com_google_absl//absl/memory", "@llvm-project//llvm:support", - "@llvm-project//mlir:AllPassesAndDialects", # TODO: only Linalg is needed "@llvm-project//mlir:IR", "@llvm-project//mlir:LinalgOps", "@llvm-project//mlir:Pass", @@ -193,7 +192,6 @@ cc_library( deps = [ ":lhlo", "@com_google_absl//absl/memory", - "@llvm-project//mlir:AllPassesAndDialects", # TODO: only Linalg is needed "@llvm-project//mlir:LinalgOps", "@llvm-project//mlir:LinalgTransforms", "@llvm-project//mlir:Pass", diff --git a/tensorflow/compiler/xla/service/mlir_gpu/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/BUILD index 1eab89da887..51be8d6fdb5 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/BUILD +++ b/tensorflow/compiler/xla/service/mlir_gpu/BUILD @@ -159,7 +159,6 @@ cc_library( "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/memory", "@llvm-project//mlir:AffineToStandardTransforms", - "@llvm-project//mlir:AllPassesAndDialects", "@llvm-project//mlir:CFGTransforms", "@llvm-project//mlir:GPUDialect", "@llvm-project//mlir:GPUToNVVMTransforms", From e95a9b71f8c3f812784bc6af8c8a6360506f2c56 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 17 Feb 2020 12:46:20 -0800 Subject: [PATCH 103/442] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 295616988 Change-Id: If0b09f1205e23f33dbc662e4f69bfcd83b01f48f --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index ffa9931d561..86be1ef98aa 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. 
-// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21329,7 +21329,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22037,7 +22037,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22233,7 +22233,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22302,7 +22302,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22417,7 +22417,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22476,7 +22476,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22650,7 +22650,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22841,7 +22841,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25281,7 +25281,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25613,7 +25613,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25663,7 +25663,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25913,7 +25913,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26543,7 +26543,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27608,7 +27608,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45467,7 +45467,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 799652174cf675fe8fedb807d0b2e87f1fae15d0 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 17 Feb 2020 14:46:30 -0800 Subject: [PATCH 104/442] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 295626647 Change-Id: I19ae7816ab7c1ca6136ae1f9834c222ce19b7785 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 86be1ef98aa..ffa9931d561 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21329,7 +21329,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22037,7 +22037,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22233,7 +22233,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22302,7 +22302,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22417,7 +22417,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22476,7 +22476,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22650,7 +22650,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22841,7 +22841,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. 
The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25281,7 +25281,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25613,7 +25613,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25663,7 +25663,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25913,7 +25913,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26543,7 +26543,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27608,7 +27608,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45467,7 +45467,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From f7eaa0ed078bedd4b1508de5f11d1f23f5f58338 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 17 Feb 2020 16:46:25 -0800 Subject: [PATCH 105/442] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 295636635 Change-Id: I571666fab83a7a056be4c3b4100853b712e8bdd9 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index ffa9931d561..86be1ef98aa 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. 
-// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21329,7 +21329,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22037,7 +22037,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22233,7 +22233,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22302,7 +22302,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22417,7 +22417,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22476,7 +22476,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22650,7 +22650,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22841,7 +22841,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25281,7 +25281,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25613,7 +25613,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25663,7 +25663,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25913,7 +25913,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26543,7 +26543,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27608,7 +27608,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45467,7 +45467,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 192a4f071d06b6f801845a77de44fd49b173b9a9 Mon Sep 17 00:00:00 2001 From: Pallavi G Date: Fri, 14 Feb 2020 13:20:33 +0800 Subject: [PATCH 106/442] [INTEL MKL] DNN1.0 integration - concat op --- tensorflow/core/kernels/mkl_concat_op.cc | 172 +++++++++++++++++------ 1 file changed, 130 insertions(+), 42 deletions(-) diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc index 8470a7e2728..aa281254922 100644 --- a/tensorflow/core/kernels/mkl_concat_op.cc +++ b/tensorflow/core/kernels/mkl_concat_op.cc @@ -18,7 +18,6 @@ limitations under the License. #include #include "mkldnn.hpp" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -30,7 +29,9 @@ limitations under the License. 
#include "tensorflow/core/kernels/quantization_utils.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/mkl_types.h" #include "tensorflow/core/util/mkl_util.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" using mkldnn::concat; using mkldnn::stream; @@ -183,13 +184,12 @@ class EigenConcatBaseOp : public OpKernel { const auto in = values[i]; const bool in_is_scalar = TensorShapeUtils::IsScalar(input_shapes[i]); OP_REQUIRES( - c, - (input_shapes[i].dims() == input_dims) || - (input_is_scalar && in_is_scalar), + c, (input_shapes[i].dims() == input_dims) || + (input_is_scalar && in_is_scalar), errors::InvalidArgument( "ConcatOp : Ranks of all input tensors should match: shape[0] = ", - input_shape.DebugString(), " vs. shape[", i, - "] = ", input_shapes[i].DebugString())); + input_shape.DebugString(), " vs. shape[", i, "] = ", + input_shapes[i].DebugString())); if (in.NumElements() > 0) { int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0; inputs_flat.emplace_back(new typename TTypes::ConstMatrix( @@ -240,11 +240,11 @@ struct MklConcatFwdParams { memory::dims dst_dims; int num_inputs; int concat_dims; - memory::format mkl_common_format; + MEMORY_FORMAT mkl_common_format; MklConcatFwdParams(std::vector& src_dims_pt, memory::dims dst_dims, int num_inputs, int concat_dims, - memory::format mkl_common_format) + MEMORY_FORMAT mkl_common_format) : dst_dims(dst_dims), num_inputs(num_inputs), concat_dims(concat_dims), @@ -264,8 +264,8 @@ class MklConcatFwdPrimitive : public MklPrimitive { public: explicit MklConcatFwdPrimitive(const MklConcatFwdParams& concat_fwd_dims, const std::vector& srcs_md) - : cpu_engine_(engine::cpu, 0) { - context_.fwd_stream.reset(new stream(stream::kind::eager)); + : cpu_engine_(ENGINE_CPU, 0) { + context_.fwd_stream.reset(new CPU_STREAM(stream::kind::eager)); // Create concat primitive Setup(concat_fwd_dims, srcs_md); } @@ -290,7 +290,16 @@ class MklConcatFwdPrimitive : public MklPrimitive { context_.data_mem[i] = *context_.data_mem_shdptr[i]; } +#ifdef ENABLE_MKLDNN_V1 + DCHECK_EQ(context_.fwd_primitives.size(), + context_.fwd_primitives_args.size()); + for (size_t i = 0; i < context_.fwd_primitives.size(); ++i) { + context_.fwd_primitives.at(i).execute(*context_.fwd_stream, + context_.fwd_primitives_args.at(i)); + } +#else context_.fwd_stream->submit(context_.fwd_primitives); +#endif // ENABLE_MKLDNN_V1 // After exec, set data handle back context_.dst_mem->set_data_handle(DummyData); @@ -306,12 +315,18 @@ class MklConcatFwdPrimitive : public MklPrimitive { private: // Primitive reuse context for concat Fwd op struct ConcatFwdContext { +#ifndef ENABLE_MKLDNN_V1 std::vector src_pd; std::vector> src_pd_shdptr; std::shared_ptr dst_pd; +#endif // ENABLE_MKLDNN_V1 - // MKL-DNN memory +// MKL-DNN memory +#ifdef ENABLE_MKLDNN_V1 + std::vector data_mem; +#else std::vector data_mem; +#endif // ENABLE_MKLDNN_V1 std::vector> data_mem_shdptr; std::shared_ptr dst_mem; @@ -326,6 +341,10 @@ class MklConcatFwdPrimitive : public MklPrimitive { std::shared_ptr fwd_stream; std::vector fwd_primitives; +#ifdef ENABLE_MKLDNN_V1 + std::vector> fwd_primitive_args; +#endif // ENABLE_MKLDNN_V1 + ConcatFwdContext() : dst_mem(nullptr), fwd_pd(nullptr), @@ -342,35 +361,61 @@ class MklConcatFwdPrimitive : public MklPrimitive { std::shared_ptr source_md( new memory::desc(srcs_md[i].data)); context_.src_md.push_back(source_md); - +#ifdef ENABLE_MKLDNN_V1 + std::shared_ptr src_mem( + new 
mkldnn::memory(*source_md, cpu_engine_, DummyData)); +#else std::shared_ptr src_mpd( new memory::primitive_desc(*source_md, cpu_engine_)); context_.src_pd_shdptr.push_back(src_mpd); std::shared_ptr src_mem( new mkldnn::memory(*src_mpd, DummyData)); - context_.data_mem_shdptr.push_back(src_mem); - - context_.data_mem.push_back(*context_.data_mem_shdptr[i]); context_.src_pd.push_back(*context_.src_pd_shdptr[i]); +#endif // ENABLE_MKLDNN_V1 + context_.data_mem_shdptr.push_back(src_mem); + context_.data_mem.push_back(*context_.data_mem_shdptr[i]); } - // Create a concat primitive descriptor +// Create a concat primitive descriptor +#ifdef ENABLE_MKLDNN_V1 + context_.fwd_pd.reset(new concat::primitive_desc( + concat_fwd_dims.concat_dims, context_.src_md, cpu_engine_)); +#else context_.fwd_pd.reset(new concat::primitive_desc( concat_fwd_dims.concat_dims, context_.src_pd)); +#endif // ENABLE_MKLDNN_V1 // Store the expected memory format context_.dst_md.reset(new memory::desc({concat_fwd_dims.dst_dims}, MklDnnType(), concat_fwd_dims.mkl_common_format)); +#ifdef ENABLE_MKLDNN_V1 + // Create memory primitive based on dummy data + context_.dst_mem.reset( + new memory(*context_.dst_md, cpu_engine_, DummyData)); +#else context_.dst_pd.reset( new memory::primitive_desc(*context_.dst_md, cpu_engine_)); // Create memory primitive based on dummy data context_.dst_mem.reset(new memory(*context_.dst_pd, DummyData)); +#endif // ENABLE_MKLDNN_V1 +#ifdef ENABLE_MKLDNN_V1 + context_.concat_fwd.reset(new concat(*context_.fwd_pd)); + std::unordered_map net_args = { + { MKLDNN_ARG_DST, + *context_.dst_mem }}; + for (int i = 0; i < concat_fwd_dims.num_inputs; ++i) { + net_args.insert({MKLDNN_ARG_MULTIPLE_SRC + i, context_.data_mem[i]}); + } + + context_.fwd_primitives_args.push_back(net_args); +#else // Create concat primitive context_.concat_fwd.reset( new concat(*context_.fwd_pd, context_.data_mem, *context_.dst_mem)); +#endif // ENABLE_MKLDNN_V1 context_.fwd_primitives.push_back(*context_.concat_fwd); } @@ -456,7 +501,7 @@ class MklConcatOp : public OpKernel { void Compute(OpKernelContext* context) override { try { - auto cpu_engine = engine(engine::cpu, 0); + auto cpu_engine = engine(ENGINE_CPU, 0); OpInputList input_tensors; GetMklInputList(context, "values", &input_tensors); const int N = input_tensors.size(); @@ -586,13 +631,17 @@ class MklConcatOp : public OpKernel { // output format that is same as input formats. 
dst_dims = TFShapeToMklDnnDims(input_tensors[0].shape()); - std::vector srcs_pd; + std::vector srcs_pd; std::vector> srcs(N, MklDnnData(&cpu_engine)); int64 dst_concat_dim_size = 0; bool isMklReorderNeeded = false; - memory::format mkl_common_format = memory::format::any; + MEMORY_FORMAT mkl_common_format = MEMORY_FORMAT::any; +#ifdef ENABLE_MKLDNN_V1 + std::vector inputs; +#else std::vector inputs; +#endif // ENABLE_MKLDNN_V1 std::vector src_dims_pt; std::vector srcs_mem; std::vector srcs_md; @@ -608,7 +657,11 @@ class MklConcatOp : public OpKernel { if (input_tensors[k].NumElements() == 0) continue; auto src_md = mkl_input_shapes[k].GetMklLayout(); srcs[k].SetUsrMem(src_md, &input_tensors[k]); +#ifdef ENABLE_MKLDNN_V1 + auto src_mpd = srcs[k].GetUsrMemDesc(); +#else auto src_mpd = srcs[k].GetUsrMemPrimDesc(); +#endif // ENABLE_MKLDNN_V1 srcs_pd.push_back(src_mpd); inputs.push_back(srcs[k].GetOpMem()); } @@ -626,8 +679,11 @@ class MklConcatOp : public OpKernel { src_md = memory::desc(src_dims, MklDnnType(), mkl_common_format); } - +#ifdef ENABLE_MKLDNN_V1 + srcs_pd.push_back(memory::desc(src_md)); +#else srcs_pd.push_back(memory::primitive_desc(src_md, cpu_engine)); +#endif // ENABLE_MKLDNN_V1 } } } else { // All TF inputs @@ -641,15 +697,19 @@ class MklConcatOp : public OpKernel { // It does not matter what data format to be used (NHWC versus NCHW). // We just need to ensure that output uses same data format as inputs. if (s_dims == 4) - mkl_common_format = memory::format::nchw; + mkl_common_format = MEMORY_FORMAT::nchw; else if (s_dims == 2) - mkl_common_format = memory::format::nc; + mkl_common_format = MEMORY_FORMAT::nc; auto src_md = memory::desc(src_dims, MklDnnType(), mkl_common_format); srcs[k].SetUsrMem(src_md, &input_tensors[k]); +#ifdef ENABLE_MKLDNN_V1 + auto src_mpd = srcs[k].GetUsrMemDesc(); +#else auto src_mpd = srcs[k].GetUsrMemPrimDesc(); +#endif // ENABLE_MKLDNN_V1 srcs_pd.push_back(src_mpd); inputs.push_back(srcs[k].GetOpMem()); src_dims_pt.push_back(src_dims); @@ -660,7 +720,7 @@ class MklConcatOp : public OpKernel { dst_dims[concat_dim] = dst_concat_dim_size; MklDnnData dst(&cpu_engine); - memory::desc dst_md({}, memory::data_undef, memory::format_undef); + memory::desc dst_md({}, MEMORY_DATA_TYPE_UNDEF, MEMORY_FORMAT_UNDEF); memory::dims dst_dims_in_nchw; if (are_all_mkl_inputs) { // Since we are passing a specific format for destination, @@ -669,19 +729,27 @@ class MklConcatOp : public OpKernel { if (dst_dims.size() == 4) { dst_dims_in_nchw = MklDnnDimsInNCHW( dst_dims, MklDnnDataFormatToTFDataFormat(orig_tf_format)); - // Set the output format same as the most common format of inputs - // to avoid layout conversions. - if (mkl_common_format == memory::format::blocked) { - VLOG(1) << "mkl_common_format == memory::format::blocked"; +// Set the output format same as the most common format of inputs +// to avoid layout conversions. +#ifdef ENABLE_MKLDNN_V1 + // DNN 1.0: internal format is always blocked; + // format_tag does not have "blocked" field. 
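+        // In DNNL 1.x a memory::desc built from explicit strides (which is
+        // what CreateBlockedMemDesc produces below) already describes a
+        // blocked layout, so no dedicated "blocked" format tag is needed.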
+ VLOG(1) << "mkl_common_format == MEMORY_FORMAT::blocked"; + dst_md = MklDnnData::CreateBlockedMemDesc( + dst_dims_in_nchw, CalculateTFStrides(dst_dims_in_nchw)); +#else + if (mkl_common_format == MEMORY_FORMAT::blocked) { + VLOG(1) << "mkl_common_format == MEMORY_FORMAT::blocked"; dst_md = MklDnnData::CreateBlockedMemDesc( dst_dims_in_nchw, CalculateTFStrides(dst_dims_in_nchw)); } else { dst_md = memory::desc(dst_dims_in_nchw, MklDnnType(), mkl_common_format); } +#endif // ENABLE_MKLDNN_V1 } else if (dst_dims.size() == 2 && - mkl_common_format == memory::format::nc) { - // When memory::format::nc, dst_dims are already in MKL-DNN order + mkl_common_format == MEMORY_FORMAT::nc) { + // When MEMORY_FORMAT::nc, dst_dims are already in MKL-DNN order dst_md = memory::desc(dst_dims, MklDnnType(), mkl_common_format); } else { TF_CHECK_OK(Status(error::Code::FAILED_PRECONDITION, @@ -697,7 +765,11 @@ class MklConcatOp : public OpKernel { if (isMklReorderNeeded) { for (int k = 0; k < input_tensors.size(); k++) { if (input_tensors[k].NumElements() > 0) { +#ifdef ENABLE_MKLDNN_V1 + srcs[k].CheckReorderToOpMem(srcs_pd[k], cpu_engine); +#else srcs[k].CheckReorderToOpMem(srcs_pd[k]); +#endif // ENABLE_MKLDNN_V1 inputs.push_back(srcs[k].GetOpMem()); } } @@ -715,8 +787,13 @@ class MklConcatOp : public OpKernel { if (!inputs.empty()) { if (are_all_mkl_inputs) { +#ifdef ENABLE_MKLDNN_V1 + auto concat_pd = + concat::primitive_desc(concat_dim, srcs_pd, cpu_engine); +#else auto concat_pd = concat::primitive_desc(concat_dim, srcs_pd); - auto dst_pd = concat_pd.dst_primitive_desc(); +#endif // ENABLE_MKLDNN_V1 + auto dst_pd = concat_pd.PRIMITIVE_DESC_DST; MklDnnShape dnn_shape_dst; TensorShape tf_shape_dst; @@ -734,11 +811,22 @@ class MklConcatOp : public OpKernel { if (dnn_shape_dst.IsMklTensor()) dst_md = dnn_shape_dst.GetMklLayout(); dst.SetUsrMem(dst_md, dst_tensor); - + stream concat_stream = CPU_STREAM(cpu_engine); +#ifdef ENABLE_MKLDNN_V1 + auto concat_op = concat(concat_pd); + std::unordered_map net_args = { + { MKLDNN_ARG_DST, + dst.GetOpMem() }}; + for (int i = 0; i < inputs.size(); ++i) { + net_args.insert({MKLDNN_ARG_MULTIPLE_SRC + i, inputs[i]}); + } + concat_op.execute(concat_stream, net_args); +#else auto concat_op = concat(concat_pd, inputs, dst.GetOpMem()); std::vector net; net.push_back(concat_op); - stream(stream::kind::eager).submit(net).wait(); + concat_stream.submit(net).wait(); +#endif // ENABLE_MKLDNN_V1 } else { MklConcatFwdPrimitive* concat_fwd = nullptr; @@ -795,9 +883,9 @@ class MklConcatOp : public OpKernel { DCHECK(dst_tensor != nullptr) << "Output tensor pointer is NULL"; } } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + ", message: " + + string(e.message) + ", in file " + string(__FILE__) + + ":" + std::to_string(__LINE__); OP_REQUIRES_OK( context, errors::Aborted("Operation received an exception:", error_msg)); @@ -856,13 +944,13 @@ class MklConcatOp : public OpKernel { // 2. concat_dim_size is the size of concat_dim. // Return: // return the common MKL format. 
- memory::format FindMklCommonFormat(const MklDnnShapeList& input_shapes, - int concat_dim, bool* is_reorder_needed, - int64* concat_dim_size) { + MEMORY_FORMAT FindMklCommonFormat(const MklDnnShapeList& input_shapes, + int concat_dim, bool* is_reorder_needed, + int64* concat_dim_size) { *is_reorder_needed = false; *concat_dim_size = 0; std::unordered_map occurrence_map; - if (input_shapes.size() == 0) return memory::format::any; + if (input_shapes.size() == 0) return MEMORY_FORMAT::any; // Compute ocurrences of each format of all inputs. for (int k = 0; k < input_shapes.size(); k++) { @@ -875,19 +963,19 @@ class MklConcatOp : public OpKernel { if (occurrence_map.size() == 1) { // this means that all inputs have a same format // return it with is_reorder_needed set false. - return static_cast( + return static_cast( input_shapes[0].GetMklLayout().data.format); } // Input tensors have different formats. Thus, reorder is needed. // We pick up the most common format to minimize the total // number of input reorder. - memory::format commonest_format = memory::format::any; + MEMORY_FORMAT commonest_format = MEMORY_FORMAT::any; int max_occurrence = 0; *is_reorder_needed = true; for (auto item : occurrence_map) { if (item.second > max_occurrence) { - commonest_format = static_cast(item.first); + commonest_format = static_cast(item.first); max_occurrence = item.second; } } From f11a059c655058cc9134395e41310779205f840b Mon Sep 17 00:00:00 2001 From: Pallavi G Date: Mon, 17 Feb 2020 13:26:25 +0800 Subject: [PATCH 107/442] Address the review comments --- tensorflow/core/kernels/mkl_concat_op.cc | 44 ++++++------------------ tensorflow/core/util/mkl_util.h | 16 ++++++--- 2 files changed, 22 insertions(+), 38 deletions(-) diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc index aa281254922..d0e5ba69560 100644 --- a/tensorflow/core/kernels/mkl_concat_op.cc +++ b/tensorflow/core/kernels/mkl_concat_op.cc @@ -291,12 +291,8 @@ class MklConcatFwdPrimitive : public MklPrimitive { } #ifdef ENABLE_MKLDNN_V1 - DCHECK_EQ(context_.fwd_primitives.size(), - context_.fwd_primitives_args.size()); - for (size_t i = 0; i < context_.fwd_primitives.size(); ++i) { - context_.fwd_primitives.at(i).execute(*context_.fwd_stream, - context_.fwd_primitives_args.at(i)); - } + execute_primitives(context_.fwd_primitives, *context_.fwd_stream, + context_.fwd_primitives_args.at(i)); #else context_.fwd_stream->submit(context_.fwd_primitives); #endif // ENABLE_MKLDNN_V1 @@ -319,7 +315,7 @@ class MklConcatFwdPrimitive : public MklPrimitive { std::vector src_pd; std::vector> src_pd_shdptr; std::shared_ptr dst_pd; -#endif // ENABLE_MKLDNN_V1 +#endif // !ENABLE_MKLDNN_V1 // MKL-DNN memory #ifdef ENABLE_MKLDNN_V1 @@ -657,11 +653,7 @@ class MklConcatOp : public OpKernel { if (input_tensors[k].NumElements() == 0) continue; auto src_md = mkl_input_shapes[k].GetMklLayout(); srcs[k].SetUsrMem(src_md, &input_tensors[k]); -#ifdef ENABLE_MKLDNN_V1 - auto src_mpd = srcs[k].GetUsrMemDesc(); -#else - auto src_mpd = srcs[k].GetUsrMemPrimDesc(); -#endif // ENABLE_MKLDNN_V1 + auto src_mpd = GET_USR_MEM_PRIM_DESC(srcs[k]); srcs_pd.push_back(src_mpd); inputs.push_back(srcs[k].GetOpMem()); } @@ -679,11 +671,8 @@ class MklConcatOp : public OpKernel { src_md = memory::desc(src_dims, MklDnnType(), mkl_common_format); } -#ifdef ENABLE_MKLDNN_V1 - srcs_pd.push_back(memory::desc(src_md)); -#else - srcs_pd.push_back(memory::primitive_desc(src_md, cpu_engine)); -#endif // ENABLE_MKLDNN_V1 + srcs_pd.push_back( + 
MEMORY_PD_CONSTRUCTOR_2_PARAMS(src_md, cpu_engine)); } } } else { // All TF inputs @@ -705,11 +694,7 @@ class MklConcatOp : public OpKernel { memory::desc(src_dims, MklDnnType(), mkl_common_format); srcs[k].SetUsrMem(src_md, &input_tensors[k]); -#ifdef ENABLE_MKLDNN_V1 - auto src_mpd = srcs[k].GetUsrMemDesc(); -#else - auto src_mpd = srcs[k].GetUsrMemPrimDesc(); -#endif // ENABLE_MKLDNN_V1 + auto src_mpd = GET_USR_MEM_PRIM_DESC(srcs[k]); srcs_pd.push_back(src_mpd); inputs.push_back(srcs[k].GetOpMem()); src_dims_pt.push_back(src_dims); @@ -765,11 +750,8 @@ class MklConcatOp : public OpKernel { if (isMklReorderNeeded) { for (int k = 0; k < input_tensors.size(); k++) { if (input_tensors[k].NumElements() > 0) { -#ifdef ENABLE_MKLDNN_V1 - srcs[k].CheckReorderToOpMem(srcs_pd[k], cpu_engine); -#else - srcs[k].CheckReorderToOpMem(srcs_pd[k]); -#endif // ENABLE_MKLDNN_V1 + srcs[k].CheckReorderToOpMem( + MEMORY_PD_WITHOUT_DATA(srcs_pd[k], cpu_engine)); inputs.push_back(srcs[k].GetOpMem()); } } @@ -787,12 +769,8 @@ class MklConcatOp : public OpKernel { if (!inputs.empty()) { if (are_all_mkl_inputs) { -#ifdef ENABLE_MKLDNN_V1 - auto concat_pd = - concat::primitive_desc(concat_dim, srcs_pd, cpu_engine); -#else - auto concat_pd = concat::primitive_desc(concat_dim, srcs_pd); -#endif // ENABLE_MKLDNN_V1 + auto concat_pd = concat::primitive_desc( + concat_dim, MEMORY_PD_WITHOUT_DATA(srcs_pd, cpu_engine)); auto dst_pd = concat_pd.PRIMITIVE_DESC_DST; MklDnnShape dnn_shape_dst; diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index a782e76547b..5e5416ee645 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -732,9 +732,9 @@ inline Status ConvertMklToTF(OpKernelContext* context, } return Status::OK(); } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + ", message: " + + string(e.message) + ", in file " + string(__FILE__) + + ":" + std::to_string(__LINE__); LOG(FATAL) << "Operation received an exception: " << error_msg; } } @@ -1254,8 +1254,8 @@ inline Status CreateBlockedMemDescHelper(const memory::dims& dim, } catch (mkldnn::error& e) { return Status(error::Code::INTERNAL, tensorflow::strings::StrCat( - "Failed to create blocked memory descriptor.", - "Status: ", e.status, ", message: ", e.message)); + "Failed to create blocked memory descriptor.", "Status: ", + e.status, ", message: ", e.message)); } #else // We have to construct memory descriptor in a C style. This is not at all @@ -2162,6 +2162,12 @@ void execute_primitives( } #endif // ENABLE_MKLDNN_V1 +#ifdef ENABLE_MKLDNN_V1 +#define GET_USR_MEM_PRIM_DESC(src) src.GetUsrMemDesc() +#else +#define GET_USR_MEM_PRIM_DESC(src) src.GetUsrMemPrimDesc() +#endif // ENABLE_MKLDNN_V1 + } // namespace tensorflow #endif // INTEL_MKL #endif // TENSORFLOW_CORE_UTIL_MKL_UTIL_H_ From ca585e7b558b83f7b687b46c79493ee26dc58488 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 17 Feb 2020 22:46:42 -0800 Subject: [PATCH 108/442] Go: Update generated wrapper functions for TensorFlow ops. 
PiperOrigin-RevId: 295667829 Change-Id: I49d45c1a4c6900a709c85082698843991056960b --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 86be1ef98aa..ffa9931d561 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. 
The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21329,7 +21329,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22037,7 +22037,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22233,7 +22233,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22302,7 +22302,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22417,7 +22417,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22476,7 +22476,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22650,7 +22650,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22841,7 +22841,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25281,7 +25281,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25613,7 +25613,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25663,7 +25663,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25913,7 +25913,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26543,7 +26543,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27608,7 +27608,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45467,7 +45467,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 7ba12b96e37a8cbb1dc6ddc97e00f203f7cb2950 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 18 Feb 2020 01:02:22 -0800 Subject: [PATCH 109/442] compat: Update forward compatibility horizon to 2020-02-18 PiperOrigin-RevId: 295681292 Change-Id: I2e8533cf07ca39d73086ee521efee1c55c69b415 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 1dae10ae638..e889b989ce0 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -31,7 +31,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 2, 17) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 2, 18) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From 669dc0c76a6b271a98047d522cf131eebfca1d08 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Tue, 5 Nov 2019 16:32:20 -0800 Subject: [PATCH 110/442] Add allow_build_at_runtime option --- tensorflow/compiler/tf2tensorrt/BUILD | 2 + .../tf2tensorrt/convert/convert_graph.cc | 2 + .../tf2tensorrt/convert/convert_graph.h | 1 + .../tf2tensorrt/convert/convert_nodes.h | 4 +- .../convert/trt_optimization_pass.cc | 4 ++ .../convert/trt_optimization_pass.h | 4 +- .../tf2tensorrt/kernels/trt_engine_op.cc | 23 +++++++++- .../tf2tensorrt/kernels/trt_engine_op_test.cc | 32 +++++++++++++- .../python/compiler/tensorrt/trt_convert.py | 43 +++++++++++++++++-- 9 files changed, 106 insertions(+), 9 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD index a55ca56e551..82b682ed7a4 100644 --- a/tensorflow/compiler/tf2tensorrt/BUILD +++ b/tensorflow/compiler/tf2tensorrt/BUILD @@ -189,6 +189,8 @@ tf_cuda_cc_test( "//tensorflow/core:test_main", "//tensorflow/core:testlib", "//tensorflow/core/kernels:ops_testutil", + "//tensorflow/core/kernels:function_ops", + "//tensorflow/core/kernels:array", ] + if_tensorrt([ "@local_config_cuda//cuda:cuda_headers", ]), diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index 0131d45f815..f17361fb211 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -468,6 +468,7 @@ Status CreateTRTNode(const ConversionParams& params, .Attr("precision_mode", prec_string) .Attr("use_calibration", info.use_calibration) .Attr("_use_implicit_batch", params.use_implicit_batch) + .Attr("_allow_build_at_runtime", info.allow_build_at_runtime) .Attr("OutT", out_types) .Finalize(&trt_node); if (!status.ok()) { @@ -671,6 +672,7 @@ Status ConvertAfterShapes(const ConversionParams& params) { : EngineInfo::EngineType::TRTStatic); curr_engine.use_calibration = params.use_calibration; curr_engine.maximum_cached_engines = params.max_cached_engines; + curr_engine.allow_build_at_runtime = params.allow_build_at_runtime; status = RegisterGraphToFunctionLibrary(curr_engine.segment_graph_def, &graph, curr_engine.engine_name); diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h index 00dc4c72f43..2bfaa2a786c 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h @@ -49,6 +49,7 @@ struct ConversionParams { int max_cached_engines = 1; bool use_calibration = true; bool use_implicit_batch = true; + bool allow_build_at_runtime = true; }; // Method to call from optimization pass diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h index d295f074a98..4375af8ad3f 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h @@ -92,7 +92,8 @@ struct EngineInfo { : engine_type(EngineType::TRTStatic), max_workspace_size_bytes(0), precision_mode(TrtPrecisionMode::FP32), - use_calibration(true) {} + use_calibration(true), + allow_build_at_runtime(true) {} string engine_name; string 
device; @@ -109,6 +110,7 @@ struct EngineInfo { int maximum_cached_engines; TrtPrecisionMode precision_mode; bool use_calibration; + bool allow_build_at_runtime; }; // Constructs a graphdef from the segment in the given graph. Adds _Arg diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc index 757ddd159c9..7995163ed44 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc @@ -70,6 +70,9 @@ Status TRTOptimizationPass::Init( if (params.count("trt_logger")) { trt_logger_name_ = params.at("trt_logger").s(); } + if (params.count("allow_build_at_runtime")) { + allow_build_at_runtime_ = params.at("allow_build_at_runtime").b(); + } if (params.count("use_implicit_batch")) { use_implicit_batch_ = params.at("use_implicit_batch").b(); } @@ -265,6 +268,7 @@ Status TRTOptimizationPass::Optimize(grappler::Cluster* cluster, cp.max_cached_engines = max_cached_batches_; cp.use_calibration = use_calibration_; cp.use_implicit_batch = use_implicit_batch_; + cp.allow_build_at_runtime = allow_build_at_runtime_; auto status = ConvertAfterShapes(cp); VLOG(1) << "Returning from " << name_; return status; diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h index 3ce0d09b7c0..f79048bb5f6 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h @@ -42,7 +42,8 @@ class TRTOptimizationPass : public grappler::CustomGraphOptimizer { max_cached_batches_(1), max_workspace_size_bytes_(256LL << 20), use_calibration_(true), - use_implicit_batch_(true) { + use_implicit_batch_(true), + allow_build_at_runtime_(true) { VLOG(1) << "Constructing " << name_; } @@ -75,6 +76,7 @@ class TRTOptimizationPass : public grappler::CustomGraphOptimizer { int64_t max_workspace_size_bytes_; bool use_calibration_; bool use_implicit_batch_; + bool allow_build_at_runtime_; }; } // namespace convert diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index 909e3e11006..b98e75527cc 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -157,6 +157,9 @@ class TRTEngineOp : public AsyncOpKernel { // Whether to use implicit batch dimension for TensorRT bool use_implicit_batch_; + // Whether to build TensorRT engines at runtime + bool allow_build_at_runtime_; + // Maximum number of cached engines int max_cached_engines_; @@ -281,6 +284,14 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) context->GetAttr("use_calibration", &use_calibration_)); OP_REQUIRES_OK(context, context->GetAttr("input_shapes", &input_partial_shapes_)); + auto status = + context->GetAttr("_allow_build_at_runtime", &allow_build_at_runtime_); + if (status.code() == tensorflow::error::NOT_FOUND) { + VLOG(2) << "Not found _allow_build_at_runtime in " + << context->device()->name() + << ", thus setting _allow_build_at_runtime=true"; + allow_build_at_runtime_ = true; + } func_handle_ = kInvalidHandle; if (!static_engine_) { FunctionLibraryRuntime* lib = context->function_library(); @@ -302,7 +313,7 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) OP_REQUIRES_OK(context, context->GetAttr("max_cached_engines_count", &max_cached_engines_)); - auto status = 
context->GetAttr("_use_implicit_batch", &use_implicit_batch_); + status = context->GetAttr("_use_implicit_batch", &use_implicit_batch_); if (status.code() == tensorflow::error::NOT_FOUND) { VLOG(2) << "Not found _use_implicit_batch in " << context->device()->name() << ", thus setting _use_implicit_batch=true"; @@ -957,6 +968,16 @@ StatusOr TRTEngineOp::GetEngine( // If matched, use that engine. Otherwise, we will look in cache for that // exact shape and possibly create a new engine if it is not in cache. if (!cache.count(engine_input_shapes)) { + if (!allow_build_at_runtime_) { + LOG(WARNING) << "Found no engine in cache matching input shapes. " + << "Not building a new engine because " + << "allow_build_at_runtime=False. " + << "The native segment will be used instead."; + // Store an empty engine in the cache for these input shapes so we don't + // try to build the same failing engine again. + cache.emplace(engine_input_shapes, absl::make_unique()); + return &empty_context; + } TrtUniquePtrType engine; bool convert_successfully = false; LOG(INFO) << "Building a new TensorRT engine for " << name() diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc index a88f2b5e29e..2cf20e443fb 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc @@ -23,7 +23,6 @@ limitations under the License. #include "absl/container/inlined_vector.h" #include "absl/strings/str_cat.h" #include "absl/types/span.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint" #include "tensorflow/cc/framework/scope.h" #include "tensorflow/cc/ops/function_ops.h" #include "tensorflow/cc/ops/math_ops.h" @@ -49,6 +48,7 @@ limitations under the License. #include "tensorflow/core/platform/refcount.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/public/version.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint" #if GOOGLE_CUDA #if GOOGLE_TENSORRT @@ -62,7 +62,8 @@ class TRTEngineOpTestBase : public OpsTestBase { public: void AddSimpleTrtOp(DataType dtype, int max_cached_engines_count = 1, PartialTensorShape shape = PartialTensorShape({-1, -1}), - bool use_implicit_batch = true) { + bool use_implicit_batch = true, + bool allow_build_at_runtime = true) { // Create the GPU device. std::unique_ptr device( DeviceFactory::NewDevice("GPU", {}, "/job:worker/replica:0/task:0")); @@ -104,6 +105,7 @@ class TRTEngineOpTestBase : public OpsTestBase { .Attr("precision_mode", "FP32") .Attr("use_calibration", false) .Attr("_use_implicit_batch", use_implicit_batch) + .Attr("_allow_build_at_runtime", allow_build_at_runtime) .Attr("OutT", {dtype}) .Finalize(OpsTestBase::node_def())); TF_ASSERT_OK(InitOpWithFunctionLibrary()); @@ -186,6 +188,32 @@ TEST_F(TRTEngineOpTestBase, DynamicEngines) { EXPECT_EQ(1, cache->count({TensorShape({10, 10})})); } +TEST_F(TRTEngineOpTestBase, AllowBuildAtRuntime) { + TRTEngineOpTestBase::AddSimpleTrtOp(DT_FLOAT, /*max_cached_engines_count=*/1, + PartialTensorShape({-1, -1}), + /*use_implicit_batch=*/true, + /*allow_build_at_runtime=*/false); + + // Execute the op + TensorShape input_shape({2, 2}); + TRTEngineOpTestBase::AddSimpleInput(input_shape); + TF_ASSERT_OK(OpsTestBase::RunOpKernel()); + + // Get the engine cache. 
+ TRTEngineCacheResource* cache_resource = nullptr; + TF_ASSERT_OK( + device_->resource_manager()->Lookup("TF-TRT", "myop", &cache_resource)); + core::ScopedUnref sc(cache_resource); + + // It should contain a placeholder with an empty cuda_engine (to mark that + // engine creation was not successful for the given input shape). + auto cache = &cache_resource->cache_; + EXPECT_EQ(1, cache->size()); + ASSERT_EQ(1, cache->count({input_shape})); + EngineContext* ectx = cache->at({input_shape}).get(); + EXPECT_EQ(ectx->cuda_engine, nullptr); +} + TEST_F(TRTEngineOpTestBase, ExplicitBatch) { // Test inference in explicit batch mode with static input shapes. Static // shapes in this context means that the TensorRT knows all the input shapes diff --git a/tensorflow/python/compiler/tensorrt/trt_convert.py b/tensorflow/python/compiler/tensorrt/trt_convert.py index 2ea22ebba49..f56f7a9b5d0 100644 --- a/tensorflow/python/compiler/tensorrt/trt_convert.py +++ b/tensorflow/python/compiler/tensorrt/trt_convert.py @@ -116,7 +116,7 @@ DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES = 1 << 30 class TrtConversionParams(collections.namedtuple("TrtConversionParams", [ "rewriter_config_template", "max_workspace_size_bytes", "precision_mode", "minimum_segment_size", "is_dynamic_op", "maximum_cached_engines", - "use_calibration", "max_batch_size"])): + "use_calibration", "max_batch_size", "allow_build_at_runtime"])): """Parameters that are used for TF-TRT conversion. Fields: @@ -151,6 +151,11 @@ class TrtConversionParams(collections.namedtuple("TrtConversionParams", [ tensors were trained with fake quantization. max_batch_size: max size for the input batch. This parameter is only effective when is_dynamic_op=False which is not supported in TF 2.0. + allow_build_at_runtime: whether to build TensorRT engines during runtime. + If no TensorRT engine can be found in cache that can handle the given + inputs during runtime, then a new TensorRT engine is built at runtime if + allow_build_at_runtime=True, and otherwise native TF is used. This + argument is only effective if is_dynamic_op=True. """ def __new__(cls, @@ -161,11 +166,12 @@ class TrtConversionParams(collections.namedtuple("TrtConversionParams", [ is_dynamic_op=True, maximum_cached_engines=1, use_calibration=True, - max_batch_size=1): + max_batch_size=1, + allow_build_at_runtime=True): return super(TrtConversionParams, cls).__new__( cls, rewriter_config_template, max_workspace_size_bytes, precision_mode, minimum_segment_size, is_dynamic_op, maximum_cached_engines, - use_calibration, max_batch_size) + use_calibration, max_batch_size, allow_build_at_runtime) DEFAULT_TRT_CONVERSION_PARAMS = TrtConversionParams() @@ -228,6 +234,13 @@ def _check_conversion_params(conversion_params, is_v2=False): not trt_optimizer.parameter_map["is_dynamic_op"]): raise ValueError("Option is_dynamic_op=False is not supported " "in TF 2.0, please set it to True instead.") + if (conversion_params.allow_build_at_runtime and + not conversion_params.is_dynamic_op): + tf_logging.warn(( + "Building TensorRT engines at runtime is not supported " + "if is_dynamic_op=False, therefore assuming " + "allow_build_at_runtime=False. 
If building TensorRT engines " + "at runtime is desired, set is_dynamic_op=True.")) def _check_trt_version_compatibility(): @@ -320,6 +333,8 @@ def get_tensorrt_rewriter_config(conversion_params, optimizer.parameter_map[ "use_calibration"].b = conversion_params.use_calibration optimizer.parameter_map["is_dynamic_op"].b = conversion_params.is_dynamic_op + optimizer.parameter_map[ + "allow_build_at_runtime"].b = conversion_params.allow_build_at_runtime if not is_v2: optimizer.parameter_map[ "max_batch_size"].i = conversion_params.max_batch_size @@ -505,7 +520,8 @@ class TrtGraphConverter(object): is_dynamic_op=is_dynamic_op, maximum_cached_engines=maximum_cached_engines, use_calibration=use_calibration, - max_batch_size=max_batch_size) + max_batch_size=max_batch_size, + allow_build_at_runtime=True) _check_conversion_params(self._conversion_params) def _run_conversion(self): @@ -1165,6 +1181,25 @@ class TrtGraphConverterV2(object): signatures = { key: value for key, value in self._saved_model.signatures.items() } + + # Set allow_build_at_runtime=False if asked by user. + # This attribute is set here because build() needs it to be True + # in order to build engines. + if not self._conversion_params.allow_build_at_runtime: + def _reset_allow_build_at_runtime(node): + node.attr["allow_build_at_runtime"].b = False + self._for_each_trt_node(self._converted_graph_def, + _reset_allow_build_at_runtime) + # Rebuild the function since a node attribute changed above + reset_converted_func = wrap_function.function_from_graph_def( + self._converted_graph_def, + [tensor.name for tensor in self._converted_func.inputs], + [tensor.name for tensor in self._converted_func.outputs]) + reset_converted_func.graph.structured_outputs = nest.pack_sequence_as( + self._converted_func.graph.structured_outputs, + reset_converted_func.graph.structured_outputs) + self._converted_func = reset_converted_func + signatures[self._input_saved_model_signature_key] = self._converted_func save.save(self._saved_model, output_saved_model_dir, signatures) From 3050e7ddd10ad1b09dca3b30d6fcf2441ca6cf4f Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Mon, 17 Feb 2020 21:08:06 +0100 Subject: [PATCH 111/442] Fix bad_function_call --- tensorflow/core/kernels/ops_testutil.cc | 6 +++++- tensorflow/core/kernels/ops_testutil.h | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/ops_testutil.cc b/tensorflow/core/kernels/ops_testutil.cc index 3dab8bf2f50..614e184b0b2 100644 --- a/tensorflow/core/kernels/ops_testutil.cc +++ b/tensorflow/core/kernels/ops_testutil.cc @@ -71,6 +71,9 @@ OpsTestBase::OpsTestBase() : device_type_(DEVICE_CPU) { auto device = DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"); CHECK(device) << "Could not create CPU device"; + thread_pool_ = absl::make_unique( + Env::Default(), /*name=*/"default", /*num_threads=*/1); + device_ = device.get(); device_mgr_ = absl::make_unique(std::move(device)); @@ -104,7 +107,8 @@ void OpsTestBase::SetDevice(const DeviceType& device_type, device_mgr_ = absl::make_unique(std::move(device)); pflr_ = absl::make_unique( device_mgr_.get(), Env::Default(), /*config=*/nullptr, - TF_GRAPH_DEF_VERSION, flib_def_.get(), OptimizerOptions()); + TF_GRAPH_DEF_VERSION, flib_def_.get(), OptimizerOptions(), + thread_pool_.get()); device_type_ = device_type; #ifdef GOOGLE_CUDA diff --git a/tensorflow/core/kernels/ops_testutil.h b/tensorflow/core/kernels/ops_testutil.h index ab7b994d9d2..f6821e3c49c 100644 --- a/tensorflow/core/kernels/ops_testutil.h 
+++ b/tensorflow/core/kernels/ops_testutil.h @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/platform/threadpool.h" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/device_base.h" #include "tensorflow/core/framework/function.h" @@ -183,6 +184,7 @@ class OpsTestBase : public ::testing::Test { std::unique_ptr flib_def_; std::unique_ptr pflr_; + std::unique_ptr thread_pool_; private: TF_DISALLOW_COPY_AND_ASSIGN(OpsTestBase); From a637febd0003251dbe1b5159e19dc4e6a9b549ed Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Tue, 18 Feb 2020 11:42:41 +0100 Subject: [PATCH 112/442] Move GetNumberOfEngineInputs into ifdef block and fix style --- .../compiler/tf2tensorrt/convert/utils.cc | 39 +++++++++---------- .../compiler/tf2tensorrt/convert/utils.h | 3 +- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.cc b/tensorflow/compiler/tf2tensorrt/convert/utils.cc index 4fe51047caf..2fb8902883e 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.cc @@ -133,6 +133,25 @@ string DebugString(const std::vector& shapes) { string DebugString(const std::vector& shapes) { return PartialTensorShapeUtils::PartialShapeListString(shapes); } + +int GetNumberOfEngineInputs(const nvinfer1::ICudaEngine* engine) { + int n_bindings = engine->getNbBindings(); + int n_input = 0; + for (int i = 0; i < n_bindings; i++) { + if (engine->bindingIsInput(i)) n_input++; + } + // According to TensorRT 7 doc: "If the engine has been built for K profiles, + // the first getNbBindings() / K bindings are used by profile number 0, the + // following getNbBindings() / K bindings are used by profile number 1 etc." + // Therefore, to get the number of input tensors, we need to divide by the + // the number of profiles. +#if IS_TRT_VERSION_GE(6, 0, 0, 0) + int n_profiles = engine->getNbOptimizationProfiles(); +#else + int n_profiles = 1; +#endif + return n_input / n_profiles; +} #endif string GetLinkedTensorRTVersion() { @@ -165,25 +184,5 @@ string GetLoadedTensorRTVersion() { return absl::StrCat(major, ".", minor, ".", patch); } -int GetNumberOfEngineInputs( - const nvinfer1::ICudaEngine *engine) { - int n_bindings = engine->getNbBindings(); - int n_input = 0; - for (int i=0; i < n_bindings; i++) { - if (engine->bindingIsInput(i)) n_input++; - } - // According to TensorRT 7 doc: "If the engine has been built for K profiles, - // the first getNbBindings() / K bindings are used by profile number 0, the - // following getNbBindings() / K bindings are used by profile number 1 etc." - // Therefore, to get the number of input tensors, we need to divide by the - // the number of profiles. -#if IS_TRT_VERSION_GE(6, 0, 0, 0) - int n_profiles = engine->getNbOptimizationProfiles(); -#else - int n_profiles = 1; -#endif - return n_input / n_profiles; -} - } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.h b/tensorflow/compiler/tf2tensorrt/convert/utils.h index 40e446b131e..668620bb90a 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.h +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.h @@ -110,8 +110,7 @@ string GetLoadedTensorRTVersion(); // number of input tensors for the network. 
This can differ from the number of // input bindings, because the number of total input bindings equals the number // of profiles times the number of engine inputs. -int GetNumberOfEngineInputs(const nvinfer1::ICudaEngine *engine); - +int GetNumberOfEngineInputs(const nvinfer1::ICudaEngine* engine); #endif // GOOGLE_CUDA && GOOGLE_TENSORRT } // namespace tensorrt From 1efab013a5d4143179d75aba8d7487c2cf9f9123 Mon Sep 17 00:00:00 2001 From: Dayeong Lee Date: Tue, 18 Feb 2020 03:06:21 -0800 Subject: [PATCH 113/442] Moves ProfilingListener to "profiling_listener.h", "profiling_listener.cc" for InternalBenchmarkTfLiteModel to instantiate this class. PiperOrigin-RevId: 295696727 Change-Id: I6b37afc1188846e2be9634571bf886751a8e708e --- tensorflow/lite/tools/benchmark/BUILD | 15 +++- .../tools/benchmark/benchmark_tflite_model.cc | 87 +----------------- .../tools/benchmark/profiling_listener.cc | 89 +++++++++++++++++++ .../lite/tools/benchmark/profiling_listener.h | 53 +++++++++++ 4 files changed, 156 insertions(+), 88 deletions(-) create mode 100644 tensorflow/lite/tools/benchmark/profiling_listener.cc create mode 100644 tensorflow/lite/tools/benchmark/profiling_listener.h diff --git a/tensorflow/lite/tools/benchmark/BUILD b/tensorflow/lite/tools/benchmark/BUILD index f6d07a55c24..df3194ff7e6 100644 --- a/tensorflow/lite/tools/benchmark/BUILD +++ b/tensorflow/lite/tools/benchmark/BUILD @@ -110,6 +110,18 @@ cc_test( ], ) +cc_library( + name = "profiling_listener", + srcs = ["profiling_listener.cc"], + hdrs = ["profiling_listener.h"], + copts = common_copts, + deps = [ + ":benchmark_model_lib", + "//tensorflow/lite/profiling:profile_summarizer", + "//tensorflow/lite/profiling:profiler", + ], +) + cc_library( name = "benchmark_tflite_model_lib", srcs = ["benchmark_tflite_model.cc"], @@ -121,6 +133,7 @@ cc_library( "//conditions:default": [], }), deps = [ + ":profiling_listener", ":benchmark_model_lib", ":benchmark_utils", ":delegate_provider_hdr", @@ -134,8 +147,6 @@ cc_library( "//tensorflow/lite:string_util", "//tensorflow/lite/experimental/ruy/profiler", "//tensorflow/lite/kernels:builtin_ops", - "//tensorflow/lite/nnapi:nnapi_util", - "//tensorflow/lite/profiling:profile_summarizer", "//tensorflow/lite/profiling:profiler", "//tensorflow/lite/tools/evaluation:utils", ] + select({ diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc index 403cb018509..064eca0022f 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc +++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc @@ -32,13 +32,11 @@ limitations under the License. #include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/model.h" #include "tensorflow/lite/op_resolver.h" -#include "tensorflow/lite/profiling/buffered_profiler.h" -#include "tensorflow/lite/profiling/profile_summarizer.h" #include "tensorflow/lite/string_util.h" -#include "tensorflow/lite/tools/benchmark/benchmark_model.h" #include "tensorflow/lite/tools/benchmark/benchmark_utils.h" #include "tensorflow/lite/tools/benchmark/delegate_provider.h" #include "tensorflow/lite/tools/benchmark/logging.h" +#include "tensorflow/lite/tools/benchmark/profiling_listener.h" #include "tensorflow/lite/tools/evaluation/utils.h" void RegisterSelectedOps(::tflite::MutableOpResolver* resolver); @@ -60,48 +58,6 @@ constexpr int kOpProfilingEnabledDefault = true; constexpr int kOpProfilingEnabledDefault = false; #endif -// Dumps profiling events if profiling is enabled. 
-class ProfilingListener : public BenchmarkListener { - public: - ProfilingListener(Interpreter* interpreter, uint32_t max_num_entries, - std::string csv_file_path = "") - : interpreter_(interpreter), - profiler_(max_num_entries), - run_summarizer_(!csv_file_path.empty()), - init_summarizer_(!csv_file_path.empty()), - csv_file_path_(csv_file_path) { - TFLITE_BENCHMARK_CHECK(interpreter); - interpreter_->SetProfiler(&profiler_); - - // We start profiling here in order to catch events that are recorded during - // the benchmark run preparation stage where TFLite interpreter is - // initialized and model graph is prepared. - profiler_.Reset(); - profiler_.StartProfiling(); - } - - void OnBenchmarkStart(const BenchmarkParams& params) override; - - void OnSingleRunStart(RunType run_type) override; - - void OnSingleRunEnd() override; - - void OnBenchmarkEnd(const BenchmarkResults& results) override; - - private: - void WriteOutput(const std::string& header, const string& data, - std::ostream* stream) { - (*stream) << header << std::endl; - (*stream) << data << std::endl; - } - - Interpreter* interpreter_; - profiling::BufferedProfiler profiler_; - profiling::ProfileSummarizer run_summarizer_; - profiling::ProfileSummarizer init_summarizer_; - std::string csv_file_path_; -}; - // Dumps ruy profiling events if the ruy profiler is enabled. class RuyProfileListener : public BenchmarkListener { public: @@ -113,47 +69,6 @@ class RuyProfileListener : public BenchmarkListener { std::unique_ptr ruy_profile_; }; -void ProfilingListener::OnBenchmarkStart(const BenchmarkParams& params) { - // At this point, we have completed the prepration for benchmark runs - // including TFLite interpreter initialization etc. So we are going to process - // profiling events recorded during this stage. - profiler_.StopProfiling(); - auto profile_events = profiler_.GetProfileEvents(); - init_summarizer_.ProcessProfiles(profile_events, *interpreter_); - profiler_.Reset(); -} - -void ProfilingListener::OnSingleRunStart(RunType run_type) { - if (run_type == REGULAR) { - profiler_.Reset(); - profiler_.StartProfiling(); - } -} - -void ProfilingListener::OnBenchmarkEnd(const BenchmarkResults& results) { - std::ofstream output_file(csv_file_path_); - std::ostream* output_stream = nullptr; - if (output_file.good()) { - output_stream = &output_file; - } - if (init_summarizer_.HasProfiles()) { - WriteOutput("Profiling Info for Benchmark Initialization:", - init_summarizer_.GetOutputString(), - output_stream == nullptr ? &TFLITE_LOG(INFO) : output_stream); - } - if (run_summarizer_.HasProfiles()) { - WriteOutput("Operator-wise Profiling Info for Regular Benchmark Runs:", - run_summarizer_.GetOutputString(), - output_stream == nullptr ? &TFLITE_LOG(INFO) : output_stream); - } -} - -void ProfilingListener::OnSingleRunEnd() { - profiler_.StopProfiling(); - auto profile_events = profiler_.GetProfileEvents(); - run_summarizer_.ProcessProfiles(profile_events, *interpreter_); -} - void RuyProfileListener::OnBenchmarkStart(const BenchmarkParams& params) { ruy_profile_.reset(new ruy::profiler::ScopeProfile); } diff --git a/tensorflow/lite/tools/benchmark/profiling_listener.cc b/tensorflow/lite/tools/benchmark/profiling_listener.cc new file mode 100644 index 00000000000..a04015219ea --- /dev/null +++ b/tensorflow/lite/tools/benchmark/profiling_listener.cc @@ -0,0 +1,89 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/tools/benchmark/profiling_listener.h" + +#include + +namespace tflite { +namespace benchmark { + +ProfilingListener::ProfilingListener(Interpreter* interpreter, + uint32_t max_num_entries, + std::string csv_file_path) + : interpreter_(interpreter), + profiler_(max_num_entries), + run_summarizer_(!csv_file_path.empty()), + init_summarizer_(!csv_file_path.empty()), + csv_file_path_(csv_file_path) { + TFLITE_BENCHMARK_CHECK(interpreter); + interpreter_->SetProfiler(&profiler_); + + // We start profiling here in order to catch events that are recorded during + // the benchmark run preparation stage where TFLite interpreter is + // initialized and model graph is prepared. + profiler_.Reset(); + profiler_.StartProfiling(); +} + +void ProfilingListener::OnBenchmarkStart(const BenchmarkParams& params) { + // At this point, we have completed the preparation for benchmark runs + // including TFLite interpreter initialization etc. So we are going to process + // profiling events recorded during this stage. + profiler_.StopProfiling(); + auto profile_events = profiler_.GetProfileEvents(); + init_summarizer_.ProcessProfiles(profile_events, *interpreter_); + profiler_.Reset(); +} + +void ProfilingListener::OnSingleRunStart(RunType run_type) { + if (run_type == REGULAR) { + profiler_.Reset(); + profiler_.StartProfiling(); + } +} + +void ProfilingListener::OnSingleRunEnd() { + profiler_.StopProfiling(); + auto profile_events = profiler_.GetProfileEvents(); + run_summarizer_.ProcessProfiles(profile_events, *interpreter_); +} + +void ProfilingListener::OnBenchmarkEnd(const BenchmarkResults& results) { + std::ofstream output_file(csv_file_path_); + std::ostream* output_stream = nullptr; + if (output_file.good()) { + output_stream = &output_file; + } + if (init_summarizer_.HasProfiles()) { + WriteOutput("Profiling Info for Benchmark Initialization:", + init_summarizer_.GetOutputString(), + output_stream == nullptr ? &TFLITE_LOG(INFO) : output_stream); + } + if (run_summarizer_.HasProfiles()) { + WriteOutput("Operator-wise Profiling Info for Regular Benchmark Runs:", + run_summarizer_.GetOutputString(), + output_stream == nullptr ? &TFLITE_LOG(INFO) : output_stream); + } +} + +void ProfilingListener::WriteOutput(const std::string& header, + const string& data, std::ostream* stream) { + (*stream) << header << std::endl; + (*stream) << data << std::endl; +} + +} // namespace benchmark +} // namespace tflite diff --git a/tensorflow/lite/tools/benchmark/profiling_listener.h b/tensorflow/lite/tools/benchmark/profiling_listener.h new file mode 100644 index 00000000000..84ef70d800d --- /dev/null +++ b/tensorflow/lite/tools/benchmark/profiling_listener.h @@ -0,0 +1,53 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_TOOLS_BENCHMARK_PROFILING_LISTENER_H_ +#define TENSORFLOW_LITE_TOOLS_BENCHMARK_PROFILING_LISTENER_H_ + +#include "tensorflow/lite/profiling/buffered_profiler.h" +#include "tensorflow/lite/profiling/profile_summarizer.h" +#include "tensorflow/lite/tools/benchmark/benchmark_model.h" + +namespace tflite { +namespace benchmark { + +// Dumps profiling events if profiling is enabled. +class ProfilingListener : public BenchmarkListener { + public: + explicit ProfilingListener(Interpreter* interpreter, uint32_t max_num_entries, + std::string csv_file_path = ""); + + void OnBenchmarkStart(const BenchmarkParams& params) override; + + void OnSingleRunStart(RunType run_type) override; + + void OnSingleRunEnd() override; + + void OnBenchmarkEnd(const BenchmarkResults& results) override; + + private: + void WriteOutput(const std::string& header, const string& data, + std::ostream* stream); + Interpreter* interpreter_; + profiling::BufferedProfiler profiler_; + profiling::ProfileSummarizer run_summarizer_; + profiling::ProfileSummarizer init_summarizer_; + std::string csv_file_path_; +}; + +} // namespace benchmark +} // namespace tflite + +#endif // TENSORFLOW_LITE_TOOLS_BENCHMARK_PROFILING_LISTENER_H_ From cf8d3b17aeff3da4cbd0afc301e1ca61af8df4f0 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Tue, 18 Feb 2020 12:42:24 +0000 Subject: [PATCH 114/442] Fix segmentation fault for CONV_2D with dilation. --- .../lite/toco/graph_transformations/identify_dilated_conv.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/toco/graph_transformations/identify_dilated_conv.cc b/tensorflow/lite/toco/graph_transformations/identify_dilated_conv.cc index bb67b623f29..ab86f5d07c9 100644 --- a/tensorflow/lite/toco/graph_transformations/identify_dilated_conv.cc +++ b/tensorflow/lite/toco/graph_transformations/identify_dilated_conv.cc @@ -86,7 +86,7 @@ bool ResolveDilatedConv(Model* model, Operator* conv_base_op, Operator* stb_op, ? 
GetOpWithInput(*model, post_conv_op->outputs[0])
                   : GetOpWithInput(*model, conv_op->outputs[0]);
   bool has_pad_op = false;
-  if (pad_op->type == OperatorType::kPad) {
+  if (pad_op && pad_op->type == OperatorType::kPad) {
     has_pad_op = true;
     CHECK_EQ(pad_op->inputs.size(), 2);
     CHECK_EQ(pad_op->outputs.size(), 1);
@@ -128,7 +128,7 @@ bool ResolveDilatedConv(Model* model, Operator* conv_base_op, Operator* stb_op,
   if (!has_pad_op) {
     auto* pre_stb_pad_op = GetOpWithOutput(*model, stb_op->inputs[0]);
     // If it is a Pad Op then just rewire the Input of Pad Op with Input of STB
-    if (pre_stb_pad_op->type == OperatorType::kPad) {
+    if (pre_stb_pad_op && pre_stb_pad_op->type == OperatorType::kPad) {
       stb_op->inputs[0] = pre_stb_pad_op->inputs[0];
       has_pad_op = true;
       pad_op = pre_stb_pad_op;
From 5d74ae0f33e98b0082e85f2e05683d892b4041f4 Mon Sep 17 00:00:00 2001
From: Vincent Abriou
Date: Fri, 3 Jan 2020 14:33:40 +0100
Subject: [PATCH 115/442] TFLite: pip package: support cross compilation environment variables

Add build environment variables to allow cross compiling the TensorFlow Lite
pip package for platforms other than RPi or x86.

Signed-off-by: Vincent Abriou
---
 .../tools/pip_package/build_pip_package.sh |  7 ++++++-
 tensorflow/lite/tools/pip_package/setup.py | 21 +++++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/tensorflow/lite/tools/pip_package/build_pip_package.sh b/tensorflow/lite/tools/pip_package/build_pip_package.sh
index df5423e4114..5a481b23124 100755
--- a/tensorflow/lite/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/lite/tools/pip_package/build_pip_package.sh
@@ -49,7 +49,12 @@ case "${TENSORFLOW_TARGET}" in
       bdist_wheel --plat-name=linux-aarch64
     ;;
   *)
-    ${PYTHON} setup.py bdist bdist_wheel
+    if [[ -n "${TENSORFLOW_TARGET}" ]] && [[ -n "${TENSORFLOW_TARGET_ARCH}" ]]; then
+      ${PYTHON} setup.py bdist --plat-name=${TENSORFLOW_TARGET}-${TENSORFLOW_TARGET_ARCH} \
+                bdist_wheel --plat-name=${TENSORFLOW_TARGET}-${TENSORFLOW_TARGET_ARCH}
+    else
+      ${PYTHON} setup.py bdist bdist_wheel
+    fi
     ;;
 esac

diff --git a/tensorflow/lite/tools/pip_package/setup.py b/tensorflow/lite/tools/pip_package/setup.py
index 90416b77bc7..9885d412b5a 100644
--- a/tensorflow/lite/tools/pip_package/setup.py
+++ b/tensorflow/lite/tools/pip_package/setup.py
@@ -50,6 +50,27 @@ elif TARGET == 'aarch64':
   os.environ['CC'] = 'aarch64-linux-gnu-gcc'
 MAKE_CROSS_OPTIONS = ['TARGET=%s' % TARGET] if TARGET else []

+TARGET_ARCH = (
+    os.environ['TENSORFLOW_TARGET_ARCH'] \
+    if 'TENSORFLOW_TARGET_ARCH' in os.environ
+    else None)
+MAKE_CROSS_OPTIONS += ['TARGET_ARCH=%s' % TARGET_ARCH] \
+    if TARGET_ARCH else []
+
+CC_PREFIX = (
+    os.environ['TENSORFLOW_CC_PREFIX'] \
+    if 'TENSORFLOW_CC_PREFIX' in os.environ
+    else None)
+MAKE_CROSS_OPTIONS += ['CC_PREFIX=%s' % CC_PREFIX] \
+    if CC_PREFIX else []
+
+EXTRA_CXXFLAGS = (
+    os.environ['TENSORFLOW_EXTRA_CXXFLAGS'] \
+    if 'TENSORFLOW_EXTRA_CXXFLAGS' in os.environ
+    else None)
+MAKE_CROSS_OPTIONS += ['EXTRA_CXXFLAGS=%s' % EXTRA_CXXFLAGS] \
+    if EXTRA_CXXFLAGS else []
+
 RELATIVE_MAKE_DIR = os.path.join('tensorflow', 'lite', 'tools', 'make')
 MAKE_DIR = os.path.join(TENSORFLOW_DIR, RELATIVE_MAKE_DIR)
 DOWNLOADS_DIR = os.path.join(MAKE_DIR, 'downloads')
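As a usage sketch for the patch above (not taken from the patch itself): the new hooks let a cross build be driven entirely through environment variables. The variable names are the ones read by build_pip_package.sh and setup.py in the diff; the target, architecture, toolchain prefix and extra flags below are illustrative assumptions for an ARM hard-float toolchain, so substitute whatever matches your own toolchain and run the script from a TensorFlow checkout:

    # Illustrative values only; only the variable names come from the patch.
    export PYTHON=python3
    export TENSORFLOW_TARGET=linux
    export TENSORFLOW_TARGET_ARCH=armv7l
    export TENSORFLOW_CC_PREFIX=arm-linux-gnueabihf-
    export TENSORFLOW_EXTRA_CXXFLAGS="-march=armv7-a -mfpu=neon-vfpv4"
    ./tensorflow/lite/tools/pip_package/build_pip_package.sh

With both TENSORFLOW_TARGET and TENSORFLOW_TARGET_ARCH set, the wheel is built through the new --plat-name=${TENSORFLOW_TARGET}-${TENSORFLOW_TARGET_ARCH} branch, and setup.py forwards TARGET_ARCH, CC_PREFIX and EXTRA_CXXFLAGS to the native Makefile build via MAKE_CROSS_OPTIONS.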
To avoid any issue while compilation, the use of a local directory is preferable. Further it will be aligned with the lite/tools/make/Makefile behavior. Signed-off-by: Vincent ABRIOU --- tensorflow/lite/tools/pip_package/Makefile | 2 +- tensorflow/lite/tools/pip_package/build_pip_package.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/tools/pip_package/Makefile b/tensorflow/lite/tools/pip_package/Makefile index 13233024ac8..eaca6e131b3 100644 --- a/tensorflow/lite/tools/pip_package/Makefile +++ b/tensorflow/lite/tools/pip_package/Makefile @@ -47,7 +47,7 @@ docker-build: docker-image --volume $(OUT_DIR):/out \ $(TAG_IMAGE) \ /bin/bash -c "tensorflow/tensorflow/lite/tools/pip_package/build_pip_package.sh && \ - (cp /tmp/tflite_pip/*.deb /tmp/tflite_pip/$(PYTHON)/dist/{*.whl,*.tar.gz} /out 2>/dev/null || true)" + (cp ${MAKEFILE_DIR}/gen/tflite_pip/*.deb ${MAKEFILE_DIR}/gen/tflite_pip/python3/dist/{*.whl,*.tar.gz} /out 2>/dev/null || true)" clean: rm -rf $(CURDIR)/out diff --git a/tensorflow/lite/tools/pip_package/build_pip_package.sh b/tensorflow/lite/tools/pip_package/build_pip_package.sh index 5a481b23124..925c6142be0 100755 --- a/tensorflow/lite/tools/pip_package/build_pip_package.sh +++ b/tensorflow/lite/tools/pip_package/build_pip_package.sh @@ -23,7 +23,7 @@ export TENSORFLOW_DIR="${SCRIPT_DIR}/../../../.." TENSORFLOW_LITE_DIR="${TENSORFLOW_DIR}/tensorflow/lite" TENSORFLOW_VERSION=$(grep "_VERSION = " "${TENSORFLOW_DIR}/tensorflow/tools/pip_package/setup.py" | cut -d= -f2 | sed "s/[ '-]//g") export PACKAGE_VERSION="${TENSORFLOW_VERSION}${VERSION_SUFFIX}" -BUILD_DIR="/tmp/tflite_pip/${PYTHON}" +BUILD_DIR="${SCRIPT_DIR}/gen/tflite_pip/python3" # Build source tree. rm -rf "${BUILD_DIR}" && mkdir -p "${BUILD_DIR}/tflite_runtime" From 003afb8eadf5dd4a2bb91d927b420b7df9a0a312 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Tue, 18 Feb 2020 15:08:17 +0100 Subject: [PATCH 117/442] Correct ifdefs for TensorRT opt profile handling --- .../tf2tensorrt/utils/trt_shape_optimization_profiles.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc index 6f19b8ead1c..60c01ed31dc 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_shape_optimization_profiles.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#if GOOGLE_CUDA && GOOGLE_TENSORRT namespace tensorflow { namespace tensorrt { @@ -180,3 +181,4 @@ int TrtShapeOptimizationProfile::GetNumProfiles() const { } // namespace tensorrt } // namespace tensorflow +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT From 3d652feb19c0bc3cc5e3ac566675f253cc51f1a1 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Tue, 18 Feb 2020 11:11:59 +0000 Subject: [PATCH 118/442] llvm integration PiperOrigin-RevId: 295697296 Change-Id: I74ed410bccf7c3880c545fc69cc43b3c6bca36f5 --- third_party/mlir/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index a7537830fa2..86604027483 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -1727,6 +1727,7 @@ cc_library( "include/mlir/InitAllDialects.h", "include/mlir/InitAllPasses.h", ], + defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"], deps = [ ":AffineOps", ":Analysis", From 5dfb848658a416a831d1226fac2bb7bb42c0ade6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 18 Feb 2020 04:46:09 -0800 Subject: [PATCH 119/442] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 295708246 Change-Id: I7c952c79f7d690b6a1a5b006a78f1c3da030b82a --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index ffa9931d561..86be1ef98aa 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. 
-// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21329,7 +21329,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22037,7 +22037,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22233,7 +22233,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22302,7 +22302,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22417,7 +22417,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22476,7 +22476,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22650,7 +22650,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22841,7 +22841,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25281,7 +25281,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25613,7 +25613,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25663,7 +25663,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25913,7 +25913,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26543,7 +26543,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27608,7 +27608,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45467,7 +45467,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 26e9f9ca494fbb74618121c55e255ed8b7886eaa Mon Sep 17 00:00:00 2001 From: Tiezhen WANG Date: Tue, 18 Feb 2020 04:56:51 -0800 Subject: [PATCH 120/442] Automated rollback of commit 37123e9e82bf34002b656753970fde832c2708af PiperOrigin-RevId: 295709352 Change-Id: I57f6223335cec07ddb701fa369be31452a72c34d --- tensorflow/lite/micro/micro_interpreter.cc | 104 +++++++++--------- tensorflow/lite/micro/micro_interpreter.h | 5 +- .../lite/micro/micro_interpreter_test.cc | 79 +++++++------ 3 files changed, 97 insertions(+), 91 deletions(-) diff --git a/tensorflow/lite/micro/micro_interpreter.cc b/tensorflow/lite/micro/micro_interpreter.cc index 31e75690597..2326c2d2163 100644 --- a/tensorflow/lite/micro/micro_interpreter.cc +++ b/tensorflow/lite/micro/micro_interpreter.cc @@ -52,8 +52,7 @@ MicroInterpreter::MicroInterpreter(const Model* model, error_reporter_(error_reporter), allocator_(&context_, model_, tensor_arena, tensor_arena_size, error_reporter_), - tensors_allocated_(false), - tensors_prepared_(false) { + tensors_allocated_(false) { const flatbuffers::Vector>* subgraphs = model->subgraphs(); if (subgraphs->size() != 1) { @@ -86,6 +85,21 @@ MicroInterpreter::MicroInterpreter(const Model* model, initialization_status_ = kTfLiteOk; } +MicroInterpreter::~MicroInterpreter() { + if (node_and_registrations_ != nullptr) { + for (size_t i = 0; i < operators_->size(); ++i) { + TfLiteNode* node = &(node_and_registrations_[i].node); + const TfLiteRegistration* registration = + node_and_registrations_[i].registration; + // registration is allocated outside the interpreter, so double check to + // make sure it's not nullptr; + if (registration != nullptr && registration->free != nullptr) { + registration->free(&context_, node->user_data); + } + } + } +} + void MicroInterpreter::CorrectTensorEndianness(TfLiteTensor* tensorCorr) { int32_t tensorSize = 1; for (int d = 0; d < tensorCorr->dims->size; ++d) @@ -128,8 +142,41 @@ TfLiteStatus MicroInterpreter::AllocateTensors() { op_resolver_, &node_and_registrations_)); TF_LITE_ENSURE_OK(&context_, allocator_.FinishTensorAllocation()); - tensors_allocated_ = true; - return kTfLiteOk; + // Init method is not yet implemented. 
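  // A summary of the two loops below: each registration's init() runs exactly
  // once per node, receiving custom_initial_data for custom ops or
  // builtin_data otherwise; prepare() then runs once per node after tensor
  // allocation and makes AllocateTensors() fail if any node cannot prepare.
  // With this change free() is no longer called at the end of Invoke(); it is
  // invoked from the new ~MicroInterpreter() destructor instead, so per-node
  // user_data remains valid across repeated Invoke() calls.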
+ for (size_t i = 0; i < operators_->size(); ++i) { + auto* node = &(node_and_registrations_[i].node); + auto* registration = node_and_registrations_[i].registration; + size_t init_data_size; + const char* init_data; + if (registration->builtin_code == BuiltinOperator_CUSTOM) { + init_data = reinterpret_cast(node->custom_initial_data); + init_data_size = node->custom_initial_data_size; + } else { + init_data = reinterpret_cast(node->builtin_data); + init_data_size = 0; + } + if (registration->init) { + node->user_data = + registration->init(&context_, init_data, init_data_size); + } + } + + for (size_t i = 0; i < operators_->size(); ++i) { + auto* node = &(node_and_registrations_[i].node); + auto* registration = node_and_registrations_[i].registration; + if (registration->prepare) { + TfLiteStatus prepare_status = registration->prepare(&context_, node); + if (prepare_status != kTfLiteOk) { + error_reporter_->Report( + "Node %s (number %d) failed to prepare with status %d", + OpNameFromRegistration(registration), i, prepare_status); + return kTfLiteError; + } + } + } + + tensors_allocated_ = true; + return kTfLiteOk; } TfLiteStatus MicroInterpreter::Invoke() { @@ -144,45 +191,6 @@ TfLiteStatus MicroInterpreter::Invoke() { AllocateTensors(); } - // Init method is not yet implemented. - for (size_t i = 0; i < operators_->size(); ++i) { - auto* node = &(node_and_registrations_[i].node); - auto* registration = node_and_registrations_[i].registration; - size_t init_data_size; - const char* init_data; - if (registration->builtin_code == BuiltinOperator_CUSTOM) { - init_data = reinterpret_cast(node->custom_initial_data); - init_data_size = node->custom_initial_data_size; - } else { - init_data = reinterpret_cast(node->builtin_data); - init_data_size = 0; - } - if (!tensors_prepared_ && registration->init) { - node->user_data = - registration->init(&context_, init_data, init_data_size); - } - } - - if (!tensors_prepared_) { - for (size_t i = 0; i < operators_->size(); ++i) { - auto* node = &(node_and_registrations_[i].node); - auto* registration = node_and_registrations_[i].registration; - if (registration->prepare) { - TfLiteStatus prepare_status = registration->prepare(&context_, node); - if (prepare_status != kTfLiteOk) { - error_reporter_->Report( - "Node %s (number %d) failed to prepare with status %d", - OpNameFromRegistration(registration), i, prepare_status); - return kTfLiteError; - } - } - } -#ifdef TF_LITE_MICRO_TENSORS_PREPARED - // TODO(b/148085107): Turn this value on by default. - tensors_prepared_ = true; -#endif - } - for (size_t i = 0; i < operators_->size(); ++i) { auto* node = &(node_and_registrations_[i].node); auto* registration = node_and_registrations_[i].registration; @@ -197,16 +205,6 @@ TfLiteStatus MicroInterpreter::Invoke() { } } } - - // This is actually a no-op. - // TODO(wangtz): Consider removing this code to slightly reduce binary size. 
- for (size_t i = 0; i < operators_->size(); ++i) { - auto* node = &(node_and_registrations_[i].node); - auto* registration = node_and_registrations_[i].registration; - if (registration->free) { - registration->free(&context_, node->user_data); - } - } return kTfLiteOk; } diff --git a/tensorflow/lite/micro/micro_interpreter.h b/tensorflow/lite/micro/micro_interpreter.h index 941960a5116..4d02769cc3b 100644 --- a/tensorflow/lite/micro/micro_interpreter.h +++ b/tensorflow/lite/micro/micro_interpreter.h @@ -39,6 +39,8 @@ class MicroInterpreter { uint8_t* tensor_arena, size_t tensor_arena_size, ErrorReporter* error_reporter); + ~MicroInterpreter(); + // Runs through the model and allocates all necessary input, output and // intermediate tensors. TfLiteStatus AllocateTensors(); @@ -109,7 +111,7 @@ class MicroInterpreter { template void CorrectTensorDataEndianness(T* data, int32_t size); - NodeAndRegistration* node_and_registrations_; + NodeAndRegistration* node_and_registrations_ = nullptr; const Model* model_; const OpResolver& op_resolver_; @@ -117,7 +119,6 @@ class MicroInterpreter { TfLiteContext context_ = {}; MicroAllocator allocator_; bool tensors_allocated_; - bool tensors_prepared_; TfLiteStatus initialization_status_; const flatbuffers::Vector>* tensors_; diff --git a/tensorflow/lite/micro/micro_interpreter_test.cc b/tensorflow/lite/micro/micro_interpreter_test.cc index 6d0deca6593..5ca2c3aaae2 100644 --- a/tensorflow/lite/micro/micro_interpreter_test.cc +++ b/tensorflow/lite/micro/micro_interpreter_test.cc @@ -22,6 +22,7 @@ limitations under the License. namespace tflite { namespace { + void* MockInit(TfLiteContext* context, const char* buffer, size_t length) { // We don't support delegate in TFL micro. This is a weak check to test if // context struct being zero-initialized. @@ -31,9 +32,8 @@ void* MockInit(TfLiteContext* context, const char* buffer, size_t length) { return nullptr; } -void MockFree(TfLiteContext* context, void* buffer) { - // Do nothing. -} +bool freed = false; +void MockFree(TfLiteContext* context, void* buffer) { freed = true; } TfLiteStatus MockPrepare(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; @@ -75,49 +75,56 @@ class MockOpResolver : public OpResolver { TF_LITE_MICRO_TESTS_BEGIN TF_LITE_MICRO_TEST(TestInterpreter) { + tflite::freed = false; const tflite::Model* model = tflite::testing::GetSimpleMockModel(); TF_LITE_MICRO_EXPECT_NE(nullptr, model); tflite::MockOpResolver mock_resolver; constexpr size_t allocator_buffer_size = 1024; uint8_t allocator_buffer[allocator_buffer_size]; - tflite::MicroInterpreter interpreter(model, mock_resolver, allocator_buffer, - allocator_buffer_size, - micro_test::reporter); - TF_LITE_MICRO_EXPECT_EQ(interpreter.AllocateTensors(), kTfLiteOk); - TF_LITE_MICRO_EXPECT_EQ(1, interpreter.inputs_size()); - TF_LITE_MICRO_EXPECT_EQ(2, interpreter.outputs_size()); - TfLiteTensor* input = interpreter.input(0); - TF_LITE_MICRO_EXPECT_NE(nullptr, input); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteInt32, input->type); - TF_LITE_MICRO_EXPECT_EQ(1, input->dims->size); - TF_LITE_MICRO_EXPECT_EQ(1, input->dims->data[0]); - TF_LITE_MICRO_EXPECT_EQ(4, input->bytes); - TF_LITE_MICRO_EXPECT_NE(nullptr, input->data.i32); - input->data.i32[0] = 21; + // Create a new scope so that we can test the destructor. 
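  // The scope below works together with the file-scope `freed` flag defined
  // next to MockFree(): the test resets tflite::freed, MockFree() sets it to
  // true, and the TF_LITE_MICRO_EXPECT_EQ(tflite::freed, true) after the scope
  // closes verifies that ~MicroInterpreter() called the registration's free()
  // hook when the interpreter was destroyed.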
+ { + tflite::MicroInterpreter interpreter(model, mock_resolver, allocator_buffer, + allocator_buffer_size, + micro_test::reporter); + TF_LITE_MICRO_EXPECT_EQ(interpreter.AllocateTensors(), kTfLiteOk); + TF_LITE_MICRO_EXPECT_EQ(1, interpreter.inputs_size()); + TF_LITE_MICRO_EXPECT_EQ(2, interpreter.outputs_size()); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, interpreter.Invoke()); + TfLiteTensor* input = interpreter.input(0); + TF_LITE_MICRO_EXPECT_NE(nullptr, input); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteInt32, input->type); + TF_LITE_MICRO_EXPECT_EQ(1, input->dims->size); + TF_LITE_MICRO_EXPECT_EQ(1, input->dims->data[0]); + TF_LITE_MICRO_EXPECT_EQ(4, input->bytes); + TF_LITE_MICRO_EXPECT_NE(nullptr, input->data.i32); + input->data.i32[0] = 21; - TfLiteTensor* output = interpreter.output(0); - TF_LITE_MICRO_EXPECT_NE(nullptr, output); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteInt32, output->type); - TF_LITE_MICRO_EXPECT_EQ(1, output->dims->size); - TF_LITE_MICRO_EXPECT_EQ(1, output->dims->data[0]); - TF_LITE_MICRO_EXPECT_EQ(4, output->bytes); - TF_LITE_MICRO_EXPECT_NE(nullptr, output->data.i32); - TF_LITE_MICRO_EXPECT_EQ(42, output->data.i32[0]); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, interpreter.Invoke()); - output = interpreter.output(1); - TF_LITE_MICRO_EXPECT_NE(nullptr, output); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteInt32, output->type); - TF_LITE_MICRO_EXPECT_EQ(1, output->dims->size); - TF_LITE_MICRO_EXPECT_EQ(1, output->dims->data[0]); - TF_LITE_MICRO_EXPECT_EQ(4, output->bytes); - TF_LITE_MICRO_EXPECT_NE(nullptr, output->data.i32); - TF_LITE_MICRO_EXPECT_EQ(42, output->data.i32[0]); + TfLiteTensor* output = interpreter.output(0); + TF_LITE_MICRO_EXPECT_NE(nullptr, output); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteInt32, output->type); + TF_LITE_MICRO_EXPECT_EQ(1, output->dims->size); + TF_LITE_MICRO_EXPECT_EQ(1, output->dims->data[0]); + TF_LITE_MICRO_EXPECT_EQ(4, output->bytes); + TF_LITE_MICRO_EXPECT_NE(nullptr, output->data.i32); + TF_LITE_MICRO_EXPECT_EQ(42, output->data.i32[0]); - // Just to make sure that this method works. - tflite::PrintInterpreterState(&interpreter); + output = interpreter.output(1); + TF_LITE_MICRO_EXPECT_NE(nullptr, output); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteInt32, output->type); + TF_LITE_MICRO_EXPECT_EQ(1, output->dims->size); + TF_LITE_MICRO_EXPECT_EQ(1, output->dims->data[0]); + TF_LITE_MICRO_EXPECT_EQ(4, output->bytes); + TF_LITE_MICRO_EXPECT_NE(nullptr, output->data.i32); + TF_LITE_MICRO_EXPECT_EQ(42, output->data.i32[0]); + + // Just to make sure that this method works. + tflite::PrintInterpreterState(&interpreter); + } + + TF_LITE_MICRO_EXPECT_EQ(tflite::freed, true); } TF_LITE_MICRO_TEST(TestVariableTensorReset) { From 69c24e56aa883f451612aba18f2d220adc2b59b1 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Tue, 18 Feb 2020 05:01:08 -0800 Subject: [PATCH 121/442] Bump open source llvm revision to da147ef0a5c6d31c21d31a52b97235a629830c15 This lets us drop the dep from cuda transforms to all targets, which led to increases in binary size. PiperOrigin-RevId: 295709715 Change-Id: I1c8dd4969b1df455f80aa800f6b4c1b6c0de65ae --- tensorflow/workspace.bzl | 4 ++-- third_party/mlir/BUILD | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index fd53d7cd000..dfe6a9e4499 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -595,8 +595,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. 
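    # LLVM_COMMIT and LLVM_SHA256 are updated in lockstep: the SHA256 is the
    # checksum of the llvm-project archive for that exact commit (see the
    # LLVM_URLS templates below), so bumping the revision always changes both.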
- LLVM_COMMIT = "fe36127982e0a5889cc0653718e62ba6acccf7c4" - LLVM_SHA256 = "d103d295c4825de37ea5adedd4ce28cbbca3ced59e445e4ab979219f83a0bd89" + LLVM_COMMIT = "da147ef0a5c6d31c21d31a52b97235a629830c15" + LLVM_SHA256 = "b5f85e5338f3ef7fd5f16f1307471f8545705985bd2e5423f67b58f58aedf24b" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index 86604027483..efab4468ed5 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -700,8 +700,8 @@ cc_library( ":Pass", ":Support", ":TargetNVVMIR", - "@llvm-project//llvm:all_targets", "@llvm-project//llvm:core", + "@llvm-project//llvm:nvptx_code_gen", "@llvm-project//llvm:support", "@llvm-project//llvm:target", ], From 89c08a546c5d88d6981e5f7a463519ebb6d3b5a1 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Tue, 18 Feb 2020 06:58:04 -0800 Subject: [PATCH 122/442] Add no_pip tags to tests. These tests do not work when running on the nightly pip Kokoros. They fail with errors like: ModuleNotFoundError: No module named 'tensorflow.compiler.tests' PiperOrigin-RevId: 295724751 Change-Id: I8df48e8a4663e0e1f67cf07d39691b428a0d9f0e --- tensorflow/compiler/tests/BUILD | 269 +++++++++++++++++++++++++++++--- 1 file changed, 250 insertions(+), 19 deletions(-) diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 77dbb1919be..e4b06a2e539 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -66,6 +66,9 @@ py_test( size = "small", srcs = ["xla_test_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", ], @@ -76,6 +79,9 @@ tf_xla_py_test( size = "medium", srcs = ["adadelta_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -90,6 +96,9 @@ tf_xla_py_test( size = "small", srcs = ["adagrad_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -105,6 +114,9 @@ tf_xla_py_test( size = "small", srcs = ["adagrad_da_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -119,6 +131,9 @@ tf_xla_py_test( size = "small", srcs = ["adam_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -136,6 +151,9 @@ tf_xla_py_test( # TensorList ops are not implemented in the on-demand compilation model yet. 
disabled_backends = ["cpu_ondemand"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -151,6 +169,9 @@ tf_xla_py_test( size = "small", srcs = ["argminmax_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -168,6 +189,7 @@ tf_xla_py_test( shard_count = 5, tags = [ "no_oss", # TODO(b/148108508): Re-enable this test in OSS. + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "optonly", # Times out frequently in fastbuild mode. ], deps = [ @@ -194,6 +216,7 @@ tf_xla_py_test( python_version = "PY3", shard_count = 2, tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "optonly", # Times out frequently in fastbuild mode. ], deps = [ @@ -212,6 +235,9 @@ tf_xla_py_test( size = "small", srcs = ["bucketize_op_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -226,7 +252,10 @@ tf_xla_py_test( size = "small", srcs = ["categorical_op_test.py"], python_version = "PY3", - tags = ["optonly"], + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + "optonly", + ], deps = [ ":xla_test", "//tensorflow/python:framework", @@ -242,6 +271,7 @@ tf_xla_py_test( srcs = ["cholesky_op_test.py"], python_version = "PY3", tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "no_rocm", "optonly", ], @@ -261,6 +291,9 @@ tf_xla_py_test( size = "small", srcs = ["cond_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/compiler/tf2xla/python:xla", @@ -278,7 +311,10 @@ tf_xla_py_test( size = "medium", srcs = ["self_adjoint_eig_op_test.py"], python_version = "PY3", - tags = ["optonly"], + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + "optonly", + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -297,6 +333,9 @@ tf_xla_py_test( timeout = "moderate", srcs = ["searchsorted_op_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:platform_test", @@ -314,6 +353,7 @@ tf_xla_py_test( ], python_version = "PY3", tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "no_rocm", "optonly", ], @@ -336,6 +376,7 @@ tf_xla_py_test( srcs = ["matrix_inverse_op_test.py"], python_version = "PY3", tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "noasan", "nomsan", "notsan", @@ -356,6 +397,9 @@ tf_xla_py_test( timeout = "moderate", srcs = ["matrix_solve_op_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:linalg_ops", @@ -371,7 +415,10 @@ tf_xla_py_test( timeout = "moderate", srcs = ["matrix_triangular_solve_op_test.py"], python_version = "PY3", - tags = ["optonly"], + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + "optonly", + ], deps = [ ":xla_test", 
"//tensorflow/python:array_ops", @@ -387,6 +434,9 @@ tf_xla_py_test( size = "small", srcs = ["clustering_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -403,6 +453,7 @@ tf_xla_py_test( python_version = "PY3", tags = [ "many_xla_args", + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "no_rocm", ], deps = [ @@ -423,6 +474,9 @@ tf_xla_py_test( srcs = ["conv2d_test.py"], python_version = "PY3", shard_count = 10, + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":test_utils", ":xla_test", @@ -442,6 +496,9 @@ tf_xla_py_test( srcs = ["conv3d_test.py"], python_version = "PY3", shard_count = 5, + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -460,6 +517,7 @@ tf_xla_py_test( python_version = "PY3", shard_count = 5, tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "no_rocm", "noasan", "nomsan", @@ -482,6 +540,9 @@ tf_xla_py_test( size = "small", srcs = ["dynamic_slice_ops_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ "//tensorflow/compiler/tests:xla_test", "//tensorflow/compiler/tf2xla/python:xla", @@ -499,6 +560,9 @@ tf_xla_py_test( "gpu", ], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -513,6 +577,9 @@ tf_xla_py_test( size = "small", srcs = ["reshape_op_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ "//tensorflow/compiler/tests:xla_test", "//tensorflow/compiler/tf2xla/python:xla", @@ -527,6 +594,9 @@ tf_xla_py_test( size = "small", srcs = ["dynamic_stitch_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -541,6 +611,9 @@ tf_xla_py_test( size = "small", srcs = ["extract_image_patches_op_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -556,6 +629,7 @@ tf_xla_py_test( python_version = "PY3", tags = [ "multi_and_single_gpu", + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip ], deps = [ ":xla_test", @@ -574,6 +648,9 @@ tf_xla_py_test( size = "medium", srcs = ["fifo_queue_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -591,6 +668,7 @@ tf_xla_py_test( python_version = "PY3", shard_count = 6, tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "no_rocm", "optonly", ], @@ -609,6 +687,9 @@ tf_xla_py_test( size = "small", srcs = ["slice_ops_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -623,6 +704,9 @@ tf_xla_py_test( size = "medium", srcs = ["ftrl_test.py"], python_version = 
"PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -638,6 +722,9 @@ tf_xla_py_test( size = "small", srcs = ["function_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -653,6 +740,7 @@ tf_xla_py_test( python_version = "PY3", shard_count = 10, tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "optonly", # Times out frequently in fastbuild mode. ], deps = [ @@ -669,6 +757,9 @@ tf_xla_py_test( size = "small", srcs = ["listdiff_op_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -685,6 +776,9 @@ tf_xla_py_test( size = "medium", srcs = ["lrn_ops_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -700,6 +794,9 @@ tf_xla_py_test( size = "small", srcs = ["manip_ops_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -715,7 +812,10 @@ tf_xla_py_test( timeout = "long", srcs = ["matrix_band_part_test.py"], python_version = "PY3", - tags = ["optonly"], + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + "optonly", + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -731,6 +831,9 @@ tf_xla_py_test( timeout = "long", srcs = ["matrix_diag_ops_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -744,6 +847,9 @@ tf_xla_py_test( size = "small", srcs = ["momentum_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -759,6 +865,9 @@ tf_xla_py_test( size = "small", srcs = ["nary_ops_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -773,6 +882,9 @@ tf_xla_py_test( size = "small", srcs = ["nullary_ops_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:control_flow_ops", @@ -787,6 +899,9 @@ tf_xla_py_test( srcs = ["pooling_ops_test.py"], python_version = "PY3", shard_count = 10, + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -803,6 +918,9 @@ tf_xla_py_test( srcs = ["pooling_ops_3d_test.py"], python_version = "PY3", shard_count = 10, + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -818,6 +936,9 @@ tf_xla_py_test( size = "medium", srcs = ["proximal_adagrad_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", 
"//tensorflow/python:array_ops", @@ -832,6 +953,9 @@ tf_xla_py_test( size = "medium", srcs = ["proximal_gradient_descent_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -852,7 +976,10 @@ tf_xla_py_test( ], python_version = "PY3", shard_count = 5, - tags = ["optonly"], + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + "optonly", + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -871,6 +998,7 @@ tf_xla_py_test( python_version = "PY3", shard_count = 5, tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "no_rocm", "optonly", ], @@ -892,6 +1020,7 @@ tf_xla_py_test( python_version = "PY3", shard_count = 10, tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "notap", # TODO(b/141057424): flaky on TPU ], deps = [ @@ -911,6 +1040,9 @@ tf_xla_py_test( srcs = ["reduce_ops_test.py"], python_version = "PY3", shard_count = 5, + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -927,6 +1059,9 @@ tf_xla_py_test( size = "small", srcs = ["reduce_window_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/compiler/tf2xla/python:xla", @@ -943,6 +1078,9 @@ tf_xla_py_test( size = "medium", srcs = ["reverse_ops_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -955,7 +1093,10 @@ tf_xla_py_test( size = "medium", srcs = ["reverse_sequence_op_test.py"], python_version = "PY3", - tags = ["optonly"], + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + "optonly", + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -969,6 +1110,9 @@ tf_xla_py_test( size = "small", srcs = ["rmsprop_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -984,7 +1128,10 @@ tf_xla_py_test( size = "small", srcs = ["scan_ops_test.py"], python_version = "PY3", - tags = ["optonly"], + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + "optonly", + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -999,6 +1146,9 @@ tf_xla_py_test( size = "medium", srcs = ["segment_reduction_ops_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -1015,6 +1165,9 @@ tf_xla_py_test( srcs = ["spacetobatch_op_test.py"], python_version = "PY3", shard_count = 3, + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -1029,6 +1182,9 @@ tf_xla_py_test( size = "small", srcs = ["sparse_to_dense_op_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -1043,7 +1199,10 @@ tf_xla_py_test( size = "small", srcs = ["stack_ops_test.py"], 
python_version = "PY3", - tags = ["config-cuda-only"], + tags = [ + "config-cuda-only", + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], use_xla_device = False, deps = [ ":xla_test", @@ -1060,7 +1219,10 @@ tf_xla_py_test( srcs = ["stateful_random_ops_test.py"], python_version = "PY3", shard_count = 10, - tags = ["optonly"], + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + "optonly", + ], deps = [ ":xla_test", "//tensorflow/python:framework", @@ -1076,7 +1238,10 @@ tf_xla_py_test( size = "medium", srcs = ["stateless_random_ops_test.py"], python_version = "PY3", - tags = ["optonly"], + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + "optonly", + ], deps = [ ":xla_test", "//tensorflow/python:framework", @@ -1096,6 +1261,7 @@ tf_xla_py_test( python_version = "PY3", tags = [ "config-cuda-only", + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "v1only", ], use_xla_device = False, @@ -1121,6 +1287,9 @@ tf_xla_py_test( # TensorList ops are not implemented in the on-demand compilation model yet. disabled_backends = ["cpu_ondemand"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -1136,6 +1305,9 @@ tf_xla_py_test( size = "medium", srcs = ["ternary_ops_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -1152,6 +1324,9 @@ tf_xla_py_test( size = "medium", srcs = ["unary_ops_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -1168,6 +1343,9 @@ tf_xla_py_test( size = "medium", srcs = ["fused_batchnorm_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":test_utils", ":xla_test", @@ -1188,7 +1366,10 @@ tf_xla_py_test( size = "small", srcs = ["variable_ops_test.py"], python_version = "PY3", - tags = ["optonly"], + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + "optonly", + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -1207,6 +1388,9 @@ tf_xla_py_test( size = "small", srcs = ["while_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/compiler/tf2xla/python:xla", @@ -1237,6 +1421,9 @@ tf_xla_py_test( size = "medium", srcs = ["gather_nd_op_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -1250,7 +1437,10 @@ tf_xla_py_test( size = "medium", srcs = ["scatter_nd_op_test.py"], python_version = "PY3", - tags = ["optonly"], + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + "optonly", + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -1266,7 +1456,10 @@ tf_xla_py_test( python_version = "PY3", shard_count = 1, # Times out in fastbuild mode. 
- tags = ["optonly"], + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + "optonly", + ], deps = [ "//tensorflow/compiler/tests:xla_test", "//tensorflow/compiler/tf2xla/python:xla", @@ -1280,6 +1473,9 @@ tf_xla_py_test( size = "small", srcs = ["data_format_ops_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ "//tensorflow/compiler/tests:xla_test", "//tensorflow/python:array_ops", @@ -1294,7 +1490,10 @@ tf_xla_py_test( size = "small", srcs = ["xla_device_test.py"], python_version = "PY3", - tags = ["optonly"], + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + "optonly", + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -1307,6 +1506,9 @@ cuda_py_test( name = "xla_device_gpu_test", size = "small", srcs = ["xla_device_gpu_test.py"], + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], xla_enable_strict_auto_jit = False, deps = [ "//tensorflow/python:array_ops", @@ -1323,7 +1525,10 @@ cuda_py_test( size = "medium", srcs = ["jit_test.py"], shard_count = 5, - tags = ["no_rocm"], + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + "no_rocm", + ], xla_enable_strict_auto_jit = False, deps = [ ":test_utils", @@ -1344,7 +1549,10 @@ cuda_py_test( name = "dense_layer_test", size = "medium", srcs = ["dense_layer_test.py"], - tags = ["no_rocm"], + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + "no_rocm", + ], xla_enable_strict_auto_jit = False, deps = [ ":test_utils", @@ -1385,6 +1593,7 @@ tf_cuda_cc_test( size = "large", # This test is randomized, so only run it if explicitly requested. 
tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "manual", "notap", ] + tf_cuda_tests_tags(), @@ -1394,7 +1603,9 @@ tf_cuda_cc_test( tf_cuda_cc_test( name = "unary_ops_composition_test", srcs = ["unary_ops_composition_test.cc"], - tags = tf_cuda_tests_tags(), + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ] + tf_cuda_tests_tags(), deps = [ "//tensorflow/cc:cc_ops", "//tensorflow/compiler/jit", @@ -1430,7 +1641,10 @@ py_library( cuda_py_test( name = "lstm_test", srcs = ["lstm_test.py"], - tags = ["no_rocm"], + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + "no_rocm", + ], xla_enable_strict_auto_jit = False, deps = [ ":lstm", @@ -1474,6 +1688,9 @@ tf_xla_py_test( size = "medium", srcs = ["fake_quant_ops_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:framework", @@ -1486,6 +1703,9 @@ tf_xla_py_test( size = "small", srcs = ["placeholder_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", @@ -1499,6 +1719,9 @@ tf_xla_py_test( size = "medium", srcs = ["quantized_ops_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/compiler/tf2xla/python:xla", @@ -1516,6 +1739,9 @@ tf_xla_py_test( size = "medium", srcs = ["xla_ops_test.py"], python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], deps = [ ":xla_test", "//tensorflow/compiler/tf2xla/python:xla", @@ -1535,6 +1761,7 @@ tf_xla_py_test( shard_count = 5, tags = [ "no_oss", # TODO(b/148108508): Re-enable this test in OSS. + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "no_rocm", ], deps = [ @@ -1560,6 +1787,7 @@ tf_xla_py_test( ], python_version = "PY3", tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "optonly", ], deps = [ @@ -1576,7 +1804,10 @@ tf_xla_py_test( size = "medium", srcs = ["special_math_test.py"], shard_count = 5, - tags = ["optonly"], + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + "optonly", + ], deps = [ ":xla_test", "//tensorflow/python:extra_py_tests_deps", From 49aa204fc368c1be7064896aa85b45a9806e9858 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 18 Feb 2020 08:51:01 -0800 Subject: [PATCH 123/442] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 295743668 Change-Id: I3d496b19f6adda78c6c4f8b277d6566a975820fc --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 86be1ef98aa..ffa9931d561 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21329,7 +21329,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22037,7 +22037,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22233,7 +22233,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22302,7 +22302,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22417,7 +22417,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22476,7 +22476,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22650,7 +22650,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22841,7 +22841,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25281,7 +25281,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25613,7 +25613,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25663,7 +25663,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25913,7 +25913,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26543,7 +26543,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27608,7 +27608,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45467,7 +45467,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From b9a0bd18a9689bb35237757baecc6b6367a43b1d Mon Sep 17 00:00:00 2001 From: Alex Stark Date: Tue, 18 Feb 2020 09:18:52 -0800 Subject: [PATCH 124/442] Depthwise convolution 3x3 per-channel int8 for dot-product ARM (13). PiperOrigin-RevId: 295749216 Change-Id: Ieea413c1e525a06e04fe957603d167cda46e3318 --- .../depthwiseconv_uint8_3x3_filter.h | 402 +++++++++++------- .../depthwiseconv_uint8_transitional.h | 62 ++- 2 files changed, 271 insertions(+), 193 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h index e0f120415af..7ff5018ba37 100644 --- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h +++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h @@ -27,6 +27,21 @@ namespace tflite { namespace optimized_ops { namespace depthwise_conv { +#ifdef USE_NEON +inline int8x16_t util_vld1q_x8(const uint8* data_addr) { + return vreinterpretq_s8_u8(vld1q_u8(data_addr)); +} +inline int8x16_t util_vld1q_x8(const int8* data_addr) { + return vld1q_s8(data_addr); +} +inline int8x8_t util_vld1_x8(const uint8* data_addr) { + return vreinterpret_s8_u8(vld1_u8(data_addr)); +} +inline int8x8_t util_vld1_x8(const int8* data_addr) { + return vld1_s8(data_addr); +} +#endif + #define STR(s) STR_UNEXPANDED(s) #define STR_UNEXPANDED(s) #s @@ -5907,13 +5922,15 @@ struct ProcessPerDepth +template struct PackMacroBlock { static inline void PackMacroBlockNeon( - const uint8* input_block_data, int8* scratch_block_data, + const typename QuantizationTypeImpl::ExternalType* + input_block_data, + int8* scratch_block_data, const DepthwiseConvDotProdParams* function_params) { TFLITE_DCHECK_EQ(function_params->padding_bottom, 0); TFLITE_DCHECK_EQ(function_params->padding_top, 0); @@ -5932,7 +5949,8 @@ struct PackMacroBlockinput_depth; TFLITE_DCHECK_GE(depth_micro_repeats, 0); - constexpr uint8 kSignBit = 0x80; + constexpr uint8 kSignBit = + QuantizationTypeImpl::kUint8SignBit; const int micro_block_size = 4 * 8; const int depth_advance = width_overall_micro_repeats * micro_block_size; const int width_advance = @@ -5948,14 +5966,14 @@ struct PackMacroBlock(input_block_data); + const typename QuantizationTypeImpl::ExternalType* + input_data_0 = input_block_data; int8x16_t input_data_a; int8x16_t input_data_b; int8x16_t input_data_c; @@ -5976,29 +5994,27 @@ struct PackMacroBlock= 2) { i_depth += 2; - // - - input_data_a = vld1q_s8(input_data_0); - input_data_b = vld1q_s8(input_data_0 + 1 * input_depth); - input_data_c = vld1q_s8(input_data_0 + 2 * input_depth); - input_data_d = vld1q_s8(input_data_0 + 3 * input_depth); + input_data_a = util_vld1q_x8(input_data_0); + input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth); + input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth); + input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth); input_data_0 += 16; - // - for (; i_depth < depth_micro_repeats - 1; i_depth += 2) { work_reg_a = vzip1q_s8(input_data_a, input_data_b); work_reg_b = vzip1q_s8(input_data_c, input_data_d); vzipq_s8x2_in_place(&work_reg_a, &work_reg_b); - work_reg_a = veorq_s8(work_reg_a, sign_bit); - work_reg_b = veorq_s8(work_reg_b, sign_bit); + if (quantization_type == 
QuantizationType::kNonPerChannelUint8) { + work_reg_a = veorq_s8(work_reg_a, sign_bit); + work_reg_b = veorq_s8(work_reg_b, sign_bit); + } work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b); work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d); vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp); - input_data_a = vld1q_s8(input_data_0); - input_data_b = vld1q_s8(input_data_0 + 1 * input_depth); + input_data_a = util_vld1q_x8(input_data_0); + input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth); optimized_ops_prefetch_write_l1_keep(scratch_data_0); optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16); vst1q_s8(scratch_data_0, work_reg_a); @@ -6006,41 +6022,43 @@ struct PackMacroBlock 1) { input_data_b = vld1q_lane_s8x8(input_data_0 + input_depth, input_data_b, 0); @@ -6096,8 +6116,10 @@ struct PackMacroBlock::ExternalType* + input_block_data, + int8* scratch_block_data, const DepthwiseConvDotProdParams* function_params) { - PreloadInputBlock(input_block_data, function_params); + PreloadInputBlock(input_block_data, function_params); PackMacroBlockNeon(input_block_data, scratch_block_data, function_params); } }; -template <> +template struct PackMacroBlock { static inline void PackMacroBlockNeon( int32 height_block_number, int32 width_block_number, - const uint8* input_block_data, int8* scratch_block_data, + const typename QuantizationTypeImpl::ExternalType* + input_block_data, + int8* scratch_block_data, const DepthwiseConvDotProdParams* function_params) { - constexpr uint8 kSignBit = 0x80; + constexpr uint8 kSignBit = + QuantizationTypeImpl::kUint8SignBit; const int workspace_height_stride = function_params->workspace_height_stride; @@ -6157,7 +6185,8 @@ struct PackMacroBlockpadding_bottom; TFLITE_DCHECK_GT(depth_micro_repeats, 0); - constexpr int kSymmetricZeroPoint = 128; + constexpr int kSymmetricZeroPoint = + QuantizationTypeImpl::kIntSymmetricZeroPoint; const int micro_block_size = 4 * 8; const int depth_advance = width_overall_micro_repeats * micro_block_size; @@ -6188,7 +6217,7 @@ struct PackMacroBlock(input_block_data); + const typename QuantizationTypeImpl::ExternalType* + input_data_0 = input_block_data; int8x16_t input_data_a; int8x16_t input_data_b; int8x16_t input_data_c; @@ -6241,29 +6270,28 @@ struct PackMacroBlock= 2) { i_depth += 2; - // - - input_data_a = vld1q_s8(input_data_0); - input_data_b = vld1q_s8(input_data_0 + 1 * input_depth); - input_data_c = vld1q_s8(input_data_0 + 2 * input_depth); - input_data_d = vld1q_s8(input_data_0 + 3 * input_depth); + input_data_a = util_vld1q_x8(input_data_0); + input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth); + input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth); + input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth); input_data_0 += 16; - // - for (; i_depth < depth_micro_repeats - 1; i_depth += 2) { work_reg_a = vzip1q_s8(input_data_a, input_data_b); work_reg_b = vzip1q_s8(input_data_c, input_data_d); vzipq_s8x2_in_place(&work_reg_a, &work_reg_b); - work_reg_a = veorq_s8(work_reg_a, sign_bit); - work_reg_b = veorq_s8(work_reg_b, sign_bit); + if (quantization_type == + QuantizationType::kNonPerChannelUint8) { + work_reg_a = veorq_s8(work_reg_a, sign_bit); + work_reg_b = veorq_s8(work_reg_b, sign_bit); + } work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b); work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d); vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp); - input_data_a = vld1q_s8(input_data_0); - input_data_b = vld1q_s8(input_data_0 + 1 * input_depth); + input_data_a = 
util_vld1q_x8(input_data_0); + input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth); optimized_ops_prefetch_write_l1_keep(scratch_data_0); optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16); vst1q_s8(scratch_data_0, work_reg_a); @@ -6271,41 +6299,44 @@ struct PackMacroBlock 0) { input_data_a = vld1q_lane_s8x8(input_data_0, input_data_a, 0); if (adjusted_residual_width > 1) { @@ -6361,8 +6394,10 @@ struct PackMacroBlock= 2) { i_depth += 2; - // - - input_data_a = vdupq_n_s8(-input_offset); - input_data_b = vld1q_s8(input_data_0 + 1 * input_depth); - input_data_c = vld1q_s8(input_data_0 + 2 * input_depth); - input_data_d = vld1q_s8(input_data_0 + 3 * input_depth); + input_data_a = vdupq_n_u8(-input_offset); + input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth); + input_data_c = util_vld1q_x8(input_data_0 + 2 * input_depth); + input_data_d = util_vld1q_x8(input_data_0 + 3 * input_depth); input_data_0 += 16; - // - for (; i_depth < depth_micro_repeats - 1; i_depth += 2) { work_reg_a = vzip1q_s8(input_data_a, input_data_b); work_reg_b = vzip1q_s8(input_data_c, input_data_d); vzipq_s8x2_in_place(&work_reg_a, &work_reg_b); - work_reg_a = veorq_s8(work_reg_a, sign_bit); - work_reg_b = veorq_s8(work_reg_b, sign_bit); + if (quantization_type == + QuantizationType::kNonPerChannelUint8) { + work_reg_a = veorq_s8(work_reg_a, sign_bit); + work_reg_b = veorq_s8(work_reg_b, sign_bit); + } work_reg_a_sp = vzip2q_s8(input_data_a, input_data_b); work_reg_b_sp = vzip2q_s8(input_data_c, input_data_d); vzipq_s8x2_in_place(&work_reg_a_sp, &work_reg_b_sp); - input_data_a = vdupq_n_s8(-input_offset); - input_data_b = vld1q_s8(input_data_0 + 1 * input_depth); + input_data_a = vdupq_n_u8(-input_offset); + input_data_b = util_vld1q_x8(input_data_0 + 1 * input_depth); optimized_ops_prefetch_write_l1_keep(scratch_data_0); optimized_ops_prefetch_write_l1_keep(scratch_data_0 + 16); vst1q_s8(scratch_data_0, work_reg_a); @@ -6416,41 +6450,44 @@ struct PackMacroBlock 1) { input_data_b = vld1q_lane_s8x8(input_data_0 + input_depth, @@ -6505,8 +6544,10 @@ struct PackMacroBlock::ExternalType* + input_block_data, + int8* scratch_block_data, const DepthwiseConvDotProdParams* function_params) { - PreloadInputBlock(input_block_data, function_params); + PreloadInputBlock(input_block_data, function_params); PackMacroBlockNeon(height_block_number, width_block_number, input_block_data, scratch_block_data, function_params); } }; -template <> +template struct PackMacroBlock { static inline void PackMacroBlockNeon( int32 height_block_number, int32 width_block_number, - const uint8* input_block_data, int8* scratch_block_data, + const typename QuantizationTypeImpl::ExternalType* + input_block_data, + int8* scratch_block_data, const DepthwiseConvDotProdParams* function_params) { const int workspace_height_stride = function_params->workspace_height_stride; @@ -6570,7 +6615,8 @@ struct PackMacroBlockpadding_top; const int padding_bottom = function_params->padding_bottom; - constexpr int kSymmetricZeroPoint = 128; + constexpr int kSymmetricZeroPoint = + QuantizationTypeImpl::kIntSymmetricZeroPoint; TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats); @@ -6631,7 +6677,8 @@ struct PackMacroBlock::kUint8SignBit; // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON // code. Note the blocks of 4x4 are still interleaved down the depth. 
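// --- Illustrative aside (not part of this patch) ----------------------------
// The hunks above gate the sign-bit XOR on quantization_type: only the
// kNonPerChannelUint8 path flips the top bit of each input byte, while the
// per-channel int8 path skips it because its external data is already int8.
// A minimal scalar sketch of why that works, using the kSignBit /
// kSymmetricZeroPoint constants from this header (the function name below is
// illustrative only):
#include <cstdint>

inline std::int8_t ToSymmetricInt8(std::uint8_t external_value) {
  constexpr std::uint8_t kSignBit = 0x80;  // same constant as in the kernels
  // XOR with 0x80 is equivalent to subtracting the symmetric zero point 128
  // modulo 256: 0 -> -128, 128 -> 0, 255 -> 127. The vector form is the
  // veorq_s8(work_reg, sign_bit) seen in the packing loops above.
  return static_cast<std::int8_t>(external_value ^ kSignBit);
}
// -----------------------------------------------------------------------------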
@@ -6640,8 +6687,8 @@ struct PackMacroBlock( - input_block_data + input_block_offset)); + work_reg = util_vld1q_x8(input_block_data + input_block_offset); work_reg = vextq_s8(padding_reg, work_reg, 15); - work_reg = veorq_s8(work_reg, sign_bit); + if (quantization_type == QuantizationType::kNonPerChannelUint8) { + work_reg = veorq_s8(work_reg, sign_bit); + } optimized_ops_prefetch_write_l1_keep(scratch_data); vst1q_s8(scratch_data, work_reg); copy_done += 15; @@ -6671,9 +6719,11 @@ struct PackMacroBlock( - input_block_data + input_block_offset + copy_done)); - work_reg = veorq_s8(work_reg, sign_bit); + work_reg = + util_vld1q_x8(input_block_data + input_block_offset + copy_done); + if (quantization_type == QuantizationType::kNonPerChannelUint8) { + work_reg = veorq_s8(work_reg, sign_bit); + } TFLITE_DCHECK_EQ((start_width + copy_done) % 16, 0); optimized_ops_prefetch_write_l1_keep(scratch_data + start_width + copy_done); @@ -6681,9 +6731,11 @@ struct PackMacroBlock( - input_block_data + input_block_offset + copy_done)); - half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit)); + half_work_reg = + util_vld1_x8(input_block_data + input_block_offset + copy_done); + if (quantization_type == QuantizationType::kNonPerChannelUint8) { + half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit)); + } TFLITE_DCHECK_EQ((start_width + copy_done) % 8, 0); optimized_ops_prefetch_write_l1_keep(scratch_data + start_width + copy_done); @@ -6703,16 +6755,17 @@ struct PackMacroBlock( - input_block_data + input_block_offset + copy_size - 8)); - + half_work_reg = util_vld1_x8(input_block_data + input_block_offset + + copy_size - 8); half_work_reg = vreinterpret_s8_s64( vshl_s64(vreinterpret_s64_s8(half_work_reg), vdup_n_s64(-8 * (8 - copy_remaining)))); half_work_reg = vbsl_s8(vreinterpret_u8_s8(padding_mask), vget_low_s8(padding_reg), half_work_reg); - half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit)); + if (quantization_type == QuantizationType::kNonPerChannelUint8) { + half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit)); + } TFLITE_DCHECK_EQ((start_width + copy_done) % 8, 0); optimized_ops_prefetch_write_l1_keep(scratch_data + start_width + copy_done); @@ -6748,7 +6801,9 @@ struct PackMacroBlock( input_block_data + input_block_offset), half_work_reg, 1); @@ -6836,7 +6895,9 @@ struct PackMacroBlock::ExternalType* + input_block_data, + int8* scratch_block_data, const DepthwiseConvDotProdParams* function_params) { - PreloadInputBlock(input_block_data, function_params); + PreloadInputBlock(input_block_data, function_params); PackMacroBlockNeon(height_block_number, width_block_number, input_block_data, scratch_block_data, function_params); } }; -template <> +template struct PackMacroBlock { static inline void PackMacroBlockNeon( int32 height_block_number, int32 width_block_number, - const uint8* input_block_data, int8* scratch_block_data, + const typename QuantizationTypeImpl::ExternalType* + input_block_data, + int8* scratch_block_data, const DepthwiseConvDotProdParams* function_params) { const int workspace_height_stride = function_params->workspace_height_stride; @@ -6980,7 +7047,8 @@ struct PackMacroBlock::kUint8SignBit; // Transpositions are 4x4, but doing 2 at a time is more efficient in NEON // code. Note the blocks of 4x4 are still interleaved down the depth. 
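// --- Illustrative aside (not part of this patch) ----------------------------
// util_vld1q_x8 / util_vld1_x8 (moved to the top of this header by this patch)
// are overloaded on the external pointer type so the templated PackMacroBlock
// code can load either uint8 or int8 input into the common int8x16_t working
// registers. A portable scalar analogue of that overload-resolution idea;
// the names below are illustrative sketches, not TFLite APIs:
#include <cstdint>
#include <cstring>

// uint8 source: same bit pattern, viewed as signed (the real helper is a
// vld1q_u8 followed by vreinterpretq_s8_u8).
inline void LoadBytesAsInt8(const std::uint8_t* src, std::int8_t* dst, int n) {
  std::memcpy(dst, src, n);
}
// int8 source: already the working representation (the real helper is a plain
// vld1q_s8).
inline void LoadBytesAsInt8(const std::int8_t* src, std::int8_t* dst, int n) {
  std::memcpy(dst, src, n);
}

// A caller templated on the external type picks the right overload at compile
// time, which is how the same packing routine can serve both the
// QuantizationTypeImpl<...>::ExternalType variants in this file.
template <typename ExternalType>
void PackRowSketch(const ExternalType* input, std::int8_t* scratch, int n) {
  LoadBytesAsInt8(input, scratch, n);
}
// -----------------------------------------------------------------------------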
@@ -6988,7 +7056,7 @@ struct PackMacroBlock= 16) { @@ -7002,18 +7070,22 @@ struct PackMacroBlock( - input_block_data + input_block_offset + copy_done)); - work_reg = veorq_s8(work_reg, sign_bit); + work_reg = + util_vld1q_x8(input_block_data + input_block_offset + copy_done); + if (quantization_type == QuantizationType::kNonPerChannelUint8) { + work_reg = veorq_s8(work_reg, sign_bit); + } TFLITE_DCHECK_EQ(copy_done % 16, 0); optimized_ops_prefetch_write_l1_keep(scratch_data + copy_done); vst1q_s8(scratch_data + copy_done, work_reg); } if (copy_done + 8 <= copy_size) { - half_work_reg = vld1_s8(reinterpret_cast( - input_block_data + input_block_offset + copy_done)); - half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit)); + half_work_reg = + util_vld1_x8(input_block_data + input_block_offset + copy_done); + if (quantization_type == QuantizationType::kNonPerChannelUint8) { + half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit)); + } TFLITE_DCHECK_EQ(copy_done % 8, 0); optimized_ops_prefetch_write_l1_keep(scratch_data + copy_done); vst1_s8(scratch_data + copy_done, half_work_reg); @@ -7032,14 +7104,16 @@ struct PackMacroBlock( - input_block_data + input_block_offset + copy_size - 8)); + half_work_reg = util_vld1_x8(input_block_data + input_block_offset + + copy_size - 8); half_work_reg = vreinterpret_s8_s64( vshl_s64(vreinterpret_s64_s8(half_work_reg), vdup_n_s64(-8 * (8 - copy_remaining)))); - half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit)); + if (quantization_type == QuantizationType::kNonPerChannelUint8) { + half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit)); + } TFLITE_DCHECK_EQ(copy_done % 8, 0); optimized_ops_prefetch_write_l1_keep(scratch_data + copy_done); vst1_s8(scratch_data + copy_done, half_work_reg); @@ -7069,7 +7143,9 @@ struct PackMacroBlock::ExternalType* + input_block_data, + int8* scratch_block_data, const DepthwiseConvDotProdParams* function_params) { - PreloadInputBlock(input_block_data, function_params); + PreloadInputBlock(input_block_data, function_params); PackMacroBlockNeon(height_block_number, width_block_number, input_block_data, scratch_block_data, function_params); } diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h index cbc92157a18..7afdb98c496 100644 --- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h +++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h @@ -37,18 +37,6 @@ namespace depthwise_conv { #ifdef USE_NEON -inline int8x16_t util_vld1q_x8(const uint8* data_addr) { - return vreinterpretq_s8_u8(vld1q_u8(data_addr)); -} -inline int8x16_t util_vld1q_x8(const int8* data_addr) { - return vld1q_s8(data_addr); -} -inline int8x8_t util_vld1_x8(const uint8* data_addr) { - return vreinterpret_s8_u8(vld1_u8(data_addr)); -} -inline int8x8_t util_vld1_x8(const int8* data_addr) { - return vld1_s8(data_addr); -} inline void util_vst1_x8(uint8* data_addr, int8x8_t reg) { return vst1_u8(data_addr, vreinterpret_u8_s8(reg)); } @@ -1999,7 +1987,8 @@ struct PackMacroBlock= 16) { const int copy_remaining = (copy_size + start_width) & 0x7; - padding_mask = vshl_u64(padding_mask, vdup_n_s64(8 * copy_remaining)); + padding_mask = vreinterpret_s8_s64(vshl_s64( + vreinterpret_s64_s8(padding_mask), vdup_n_s64(8 * copy_remaining))); for (int k_height = 0; k_height < copy_block_height; ++k_height) { // Work through one slice, by row, at a time. 
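// --- Illustrative aside (not part of this patch) ----------------------------
// The surrounding hunks route the byte-lane shift of padding_mask through a
// signed-64 reinterpret (vshl_s64 by 8 * copy_remaining) before the vbsl_s8
// select between padding bytes and data bytes. A scalar sketch of that
// byte-granularity select; the names are illustrative and it assumes
// 0 <= valid_bytes < 8 so the shift is well-defined:
#include <cstdint>

inline std::uint64_t SelectPaddedTail(std::uint64_t data_bytes,
                                      std::uint64_t padding_bytes,
                                      int valid_bytes) {
  // Shifting an all-ones value left by 8 * valid_bytes clears the low
  // valid_bytes lanes (keep data) and leaves the high lanes set (take
  // padding) -- the same effect as the vshl on the reinterpreted mask.
  const std::uint64_t mask = ~std::uint64_t{0} << (8 * valid_bytes);
  // Bitwise select per lane, the scalar counterpart of vbsl_s8.
  return (mask & padding_bytes) | (~mask & data_bytes);
}
// -----------------------------------------------------------------------------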
@@ -2057,10 +2046,11 @@ struct PackMacroBlock= 4) { const int copy_remaining = (copy_size + start_width) & 0x3; - padding_mask = vshl_u64(padding_mask, vdup_n_s64(8 * copy_remaining)); + padding_mask = vreinterpret_s8_s64(vshl_s64( + vreinterpret_s64_s8(padding_mask), vdup_n_s64(8 * copy_remaining))); for (int k_height = 0; k_height < copy_block_height; ++k_height) { // Work through one slice, by row, at a time. @@ -2130,10 +2121,11 @@ struct PackMacroBlock( input_block_data + input_block_offset + copy_size - 1 - i), half_work_reg, 0); } if (leading_width_padding) { - half_work_reg = vshl_n_s64(half_work_reg, 8); + half_work_reg = vreinterpret_s8_s64( + vshl_n_s64(vreinterpret_s64_s8(half_work_reg), 8)); } - half_work_reg = - vbsl_s8(padding_mask, vget_low_s8(padding_reg), half_work_reg); + half_work_reg = vbsl_s8(vreinterpret_u8_s8(padding_mask), + vget_low_s8(padding_reg), half_work_reg); if (quantization_type == QuantizationType::kNonPerChannelUint8) { half_work_reg = veor_s8(half_work_reg, vget_low_s8(sign_bit)); @@ -2376,8 +2371,9 @@ struct PackMacroBlock( input_block_data + input_block_offset + copy_size - 1 - i), From 2cd63edfea15e4c9ad0f1e4529b885ce89c246a7 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Tue, 18 Feb 2020 09:28:09 -0800 Subject: [PATCH 125/442] Pip install setuptools for sanity builds. Sanity builds install pylint, which in turn installs wrapt. Wrapt has just released a new version which seems to require setuptools, but doesn't automatically install this dependency. So for now install the needed dependency first. PiperOrigin-RevId: 295750882 Change-Id: I63beedcbb24a372c1c6062085dbfd0d7ab976ae0 --- tensorflow/tools/ci_build/presubmit/ubuntu_16/sanity/build.sh | 1 + tensorflow/tools/ci_build/release/ubuntu_16/sanity/build.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/tensorflow/tools/ci_build/presubmit/ubuntu_16/sanity/build.sh b/tensorflow/tools/ci_build/presubmit/ubuntu_16/sanity/build.sh index aa52c7619d0..d111a3bb658 100644 --- a/tensorflow/tools/ci_build/presubmit/ubuntu_16/sanity/build.sh +++ b/tensorflow/tools/ci_build/presubmit/ubuntu_16/sanity/build.sh @@ -28,6 +28,7 @@ function install_pylint () { # TODO(gunan): figure out why we get stuck with later versions of pylint. # TODO(mihaimaruseac): this is used in the release build in the same way, # maybe extract out to a common? + sudo python3 -m pip install setuptools --upgrade sudo python2 -m pip install pylint==1.6.4 sudo python3 -m pip install pylint==1.6.4 } diff --git a/tensorflow/tools/ci_build/release/ubuntu_16/sanity/build.sh b/tensorflow/tools/ci_build/release/ubuntu_16/sanity/build.sh index 06ab6b8f417..4fc600de867 100644 --- a/tensorflow/tools/ci_build/release/ubuntu_16/sanity/build.sh +++ b/tensorflow/tools/ci_build/release/ubuntu_16/sanity/build.sh @@ -25,6 +25,7 @@ sudo pip3 install pep8 # TODO(gunan): figure out why we get stuck with later versions of pylint. # Install pylint. +sudo python3 -m pip install setuptools --upgrade sudo python2 -m pip install pylint==1.6.4 sudo python3 -m pip install pylint==1.6.4 From 27e92df581abb859b8f85f59e907a88567e23a49 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Tue, 18 Feb 2020 09:44:40 -0800 Subject: [PATCH 126/442] Add visibility for bazel_pip prefixed packages. This package prefix is used in open source Kokoro pip testing. 
This fixes errors like this one: target '//tensorflow/compiler/tests:xla_test' is not visible from target '//bazel_pip/tensorflow/compiler/tests:reshape_op_test_gpu' PiperOrigin-RevId: 295754319 Change-Id: Id1d2f0c55df64a0505fa83db2961e06b33037323 --- tensorflow/BUILD | 6 ++++-- tensorflow/compiler/tests/BUILD | 8 ++++++++ tensorflow/python/BUILD | 8 ++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 4c6f15f5367..31efafb7801 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -505,13 +505,15 @@ selects.config_setting_group( package_group( name = "internal", packages = [ + # To pass open source testing in the pip Kokoros. + "//bazel_pip/tensorflow/...", "//learning/brain/swift/x10/...", "//perftools/accelerators/xprof/api/...", + "//third_party/py/autograph/...", + "//third_party/swift/tensorflow/x10/...", "//tensorflow/...", "//tensorflow_estimator/python/estimator/...", "//tensorflow_models/official/...", - "//third_party/py/autograph/...", - "//third_party/swift/tensorflow/x10/...", ], ) diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index e4b06a2e539..cbe92235643 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -18,6 +18,10 @@ package_group( includes = [ "//tensorflow/compiler/tf2xla:internal", ], + packages = [ + # To pass open source testing in the pip Kokoros. + "//bazel_pip/tensorflow/compiler/tests/...", + ], ) package_group( @@ -25,6 +29,10 @@ package_group( includes = [ "//tensorflow/compiler/tf2xla:friends", ], + packages = [ + # To pass open source testing in the pip Kokoros. + "//bazel_pip/tensorflow/compiler/tests/...", + ], ) generate_backend_suites() diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 398b56ca5fc..86a9530f337 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -2622,6 +2622,8 @@ tf_py_test( tf_gen_op_wrapper_private_py( name = "array_ops_gen", visibility = [ + # To pass open source testing in the pip Kokoros. + "//bazel_pip/tensorflow/compiler/tests:__pkg__", "//learning/brain/python/ops:__pkg__", "//tensorflow/compiler/tests:__pkg__", "//tensorflow/python/kernel_tests:__pkg__", @@ -2635,6 +2637,8 @@ tf_gen_op_wrapper_private_py( tf_gen_op_wrapper_private_py( name = "bitwise_ops_gen", visibility = [ + # To pass open source testing in the pip Kokoros. + "//bazel_pip/tensorflow/compiler/tests:__pkg__", "//learning/brain/python/ops:__pkg__", "//tensorflow/compiler/tests:__pkg__", "//tensorflow/contrib/quantization:__pkg__", @@ -2830,6 +2834,8 @@ tf_gen_op_wrapper_private_py( tf_gen_op_wrapper_private_py( name = "math_ops_gen", visibility = [ + # To pass open source testing in the pip Kokoros. + "//bazel_pip/tensorflow/compiler/tests:__pkg__", "//learning/brain/google/python/ops:__pkg__", "//learning/brain/python/ops:__pkg__", "//tensorflow/compiler/tests:__pkg__", @@ -2840,6 +2846,8 @@ tf_gen_op_wrapper_private_py( tf_gen_op_wrapper_private_py( name = "nn_ops_gen", visibility = [ + # To pass open source testing in the pip Kokoros. + "//bazel_pip/tensorflow/compiler/tests:__pkg__", "//learning/brain/python/ops:__pkg__", "//tensorflow/compiler/tests:__pkg__", "//tensorflow/python/kernel_tests:__pkg__", From ed6e08a66005a18060e7c57605ca15b55b7fd4b8 Mon Sep 17 00:00:00 2001 From: Alex Stark Date: Tue, 18 Feb 2020 09:46:22 -0800 Subject: [PATCH 127/442] Depthwise convolution 3x3 per-channel int8 for dot-product ARM (15). Introduce ASM for per-channel case. 
PiperOrigin-RevId: 295754673 Change-Id: I5698a1742be65dfdacdd9337e84781cac7235e3e --- .../internal/depthwiseconv_quantized_test.cc | 38 +- .../depthwiseconv_uint8_3x3_filter.h | 2914 ++++++++++++++++- 2 files changed, 2941 insertions(+), 11 deletions(-) diff --git a/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc b/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc index e0855f8309f..b35a66d30f2 100644 --- a/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc +++ b/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc @@ -426,7 +426,23 @@ inline void DispatchDepthwiseConvImpl( // call this code. #if defined(__aarch64__) && !defined(GOOGLE_L4T) && defined(__ANDROID__) && \ defined(__clang__) - // TODO(b/148145875): Implement ASM code for int8 per-channel. + DotProduct3x3KernelType kernel_type = + optimized_ops::depthwise_conv::CategorizeDotProductKernel< + QuantizationType::kPerChannelInt8>( + input_shape, filter_shape, output_shape, params, + params.output_shift_per_channel); + + ASSERT_NE(kernel_type, DotProduct3x3KernelType::kNone) + << "Kernel type = " << static_cast(kernel_type); + + optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3Impl< + DepthwiseConvImplementation::kUseNeon3x3DotProduct, + quantization_type>( + params, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, + /*thread_start=*/0, + /*thread_end=*/output_shape.Dims(1), /*thread_dim=*/1); + return; #endif break; } @@ -1253,7 +1269,7 @@ INSTANTIATE_TEST_SUITE_P( testing::Combine( Values(DepthwiseConvImplementation:: kUseIntrinsics3x3DotProduct), // forced_invocation - Values(1000), // tests_to_run + Values(500), // tests_to_run Values(QuantizationType::kNonPerChannelUint8), // quantization_type Bool(), // test_stride Bool(), // test_pad @@ -1273,7 +1289,7 @@ INSTANTIATE_TEST_SUITE_P( testing::Combine( Values(DepthwiseConvImplementation:: kUseIntrinsics3x3DotProduct), // forced_invocation - Values(1000), // tests_to_run + Values(500), // tests_to_run Values(QuantizationType::kPerChannelInt8), // quantization_type Bool(), // test_stride Bool(), // test_pad @@ -1305,6 +1321,22 @@ INSTANTIATE_TEST_SUITE_P( ), TestParam::TestNameSuffix); +INSTANTIATE_TEST_SUITE_P( + NeonAsmPerChannel, DepthwiseConvTest, + testing::Combine( + Values(DepthwiseConvImplementation:: + kUseNeon3x3DotProduct), // forced_invocation + Values(1000), // tests_to_run + Values(QuantizationType::kPerChannelInt8), // quantization_type + Bool(), // test_stride + Bool(), // test_pad + Bool(), // test_depth_multiplier + Values(DepthwiseConvOutputRounding::kUpward), // output_rounding + Values(1), // num_threads + Values(false) // loose_tolerance + ), + TestParam::TestNameSuffix); + // Apply the 3x3 tests through the dispatch. // Also test multi-threading. This assumes upward rounding. 
INSTANTIATE_TEST_SUITE_P( diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h index 7ff5018ba37..ff19d8282f3 100644 --- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h +++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h @@ -198,6 +198,10 @@ static_assert(offsetof(DepthwiseConvParams, output_height) == #define DP_OFFSET_WORKSPACE_HEIGHT_STRIDE DP_OFFSET_OUTPUT_HEIGHT_STRIDE + 4 // #define DP_OFFSET_FOUR_OVER_STRIDE DP_OFFSET_WORKSPACE_HEIGHT_STRIDE + 4 +// +#define DP_OFFSET_OUTPUT_MULTPLIPLIER_PER_CHANNEL DP_OFFSET_FOUR_OVER_STRIDE + 4 +#define DP_OFFSET_OUTPUT_SHIFT_PER_CHANNEL \ + DP_OFFSET_OUTPUT_MULTPLIPLIER_PER_CHANNEL + 8 static_assert(offsetof(DepthwiseConvDotProdParams, input_depth) == DP_OFFSET_INPUT_DEPTH, @@ -298,6 +302,15 @@ static_assert(offsetof(DepthwiseConvDotProdParams, workspace_height_stride) == static_assert(offsetof(DepthwiseConvDotProdParams, four_over_stride) == DP_OFFSET_FOUR_OVER_STRIDE, ""); +// +static_assert(offsetof(DepthwiseConvDotProdParams, + output_multiplier_per_channel) == + DP_OFFSET_OUTPUT_MULTPLIPLIER_PER_CHANNEL, + ""); +static_assert(offsetof(DepthwiseConvDotProdParams, output_shift_per_channel) == + DP_OFFSET_OUTPUT_SHIFT_PER_CHANNEL, + ""); + #endif // __aarch64__ && !GOOGLE_L4T - Dot product ops hard-coded #if defined(__aarch64__) && !defined(GOOGLE_L4T) @@ -5908,9 +5921,11 @@ struct ProcessPerDepth +struct ProcessPerDepth { + static inline void ProcessPerDepthNeon( + const int8* filter_data, const int32* bias_data, + int8* shuffled_filter_data, int32* adjusted_bias_data, + const DepthwiseConvDotProdParams* function_params) { + // Note that argument registers may be reused after parameter loading. 
+ // x0 %[filter_data] + // x1 %[bias_data] + // x2 %[shuffled_filter_data] + // x3 %[adjusted_bias_data] + // x4 %[function_params] +#define DC_PER_DEPTH_1 "1" +#define DC_PER_DEPTH_2 "2" +#define DC_PER_DEPTH_3 "3" + + asm volatile( // %bb.0: + "ldr w8, [%[function_params], #" STR(DP_OFFSET_DEPTH_MICRO_REPEATS) "]\n" + "cmp w8, #1\n" // =1 + "b.lt " DC_PER_DEPTH_3 "f\n" + // %bb.1: + "add x10, %[function_params], #" STR(DP_OFFSET_INPUT_OFFSET) "\n" // =24 + "ldrsw x11, [%[function_params], #" STR(DP_OFFSET_BIAS_INCREMENT) "]\n" + "ldrsw x9, [%[function_params], #" STR(DP_OFFSET_OUTPUT_DEPTH) "]\n" + "ld1r { v1.4s }, [x10]\n" + "movi v0.16b, #0\n" + "lsl x10, x11, #2\n" + "lsl x11, x11, #3\n" + "movi v2.16b, #1\n" + "mov x12, %[filter_data]\n" + // implicit-def: $q3 + // implicit-def: $q4 + // implicit-def: $q5 + // implicit-def: $q6 + // implicit-def: $q7 + // implicit-def: $q16 + // implicit-def: $q17 + // implicit-def: $q18 + // implicit-def: $q19 + DC_PER_DEPTH_2 ":\n" // =>This Inner Loop Header: Depth=1 + "add x13, %[filter_data], x9\n" + "ld1 { v3.d }[0], [x12], #8\n" + "ld1 { v4.d }[0], [x13], x9\n" + "movi v21.16b, #0\n" + "movi v20.16b, #0\n" + "subs w8, w8, #1\n" // =1 + "ld1 { v5.d }[0], [x13], x9\n" + "zip1 v22.16b, v3.16b, v4.16b\n" + "mov %[filter_data], x12\n" + "ld1 { v6.d }[0], [x13], x9\n" + "zip1 v23.16b, v5.16b, v0.16b\n" + "zip1 v24.8h, v22.8h, v23.8h\n" + "zip2 v22.8h, v22.8h, v23.8h\n" + "ld1 { v7.d }[0], [x13], x9\n" + ".word 0x4e8296d5 // sdot v21.4s, v22.16b, v2.16b\n" + ".word 0x4e829714 // sdot v20.4s, v24.16b, v2.16b\n" + "ld1 { v16.d }[0], [x13], x9\n" + "zip1 v23.16b, v6.16b, v7.16b\n" + "ld1 { v17.d }[0], [x13], x9\n" + "zip1 v25.16b, v16.16b, v0.16b\n" + "zip1 v26.8h, v23.8h, v25.8h\n" + "zip2 v23.8h, v23.8h, v25.8h\n" + "ld1 { v18.d }[0], [x13], x9\n" + ".word 0x4e8296f5 // sdot v21.4s, v23.16b, v2.16b\n" + ".word 0x4e829754 // sdot v20.4s, v26.16b, v2.16b\n" + "ld1 { v19.d }[0], [x13]\n" + "zip1 v25.16b, v17.16b, v18.16b\n" + "stp q24, q22, [%[shuffled_filter_data]]\n" + "stp q26, q23, [%[shuffled_filter_data], #32]\n" + "zip1 v22.16b, v19.16b, v0.16b\n" + "zip1 v23.8h, v25.8h, v22.8h\n" + "zip2 v22.8h, v25.8h, v22.8h\n" + "stp q23, q22, [%[shuffled_filter_data], #64]\n" + ".word 0x4e8296f4 // sdot v20.4s, v23.16b, v2.16b\n" + ".word 0x4e8296d5 // sdot v21.4s, v22.16b, v2.16b\n" + "ldr q22, [%[bias_data]]\n" + "ldr q23, [%[bias_data], x10]\n" + "add %[shuffled_filter_data], x2, #96\n" // =96 + "add %[bias_data], x1, x11\n" + "mla v22.4s, v20.4s, v1.4s\n" + "mla v23.4s, v21.4s, v1.4s\n" + "stp q22, q23, [%[adjusted_bias_data]], #32\n" + "b.ne " DC_PER_DEPTH_2 "b\n" + DC_PER_DEPTH_3 ":\n" + : + // Outputs. + [ filter_data ] "+r"(filter_data), + [ bias_data ] "+r"(bias_data), + [ shuffled_filter_data ] "+r"(shuffled_filter_data), + [ adjusted_bias_data ] "+r"(adjusted_bias_data) + : + // Inputs. + [ function_params ] "r"(function_params) + : + // Clobbers. + "cc", "memory", + // We use these NEON registers. + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + // We use these general-purpose registers. 
+ "x8", "x9", "x10", "x11", "x12", "x13"); +#undef DC_PER_DEPTH_1 +#undef DC_PER_DEPTH_2 +#undef DC_PER_DEPTH_3 + } + + static void __attribute__((noinline)) + Run(const int8* filter_data, const int32* bias_data, + int8* shuffled_filter_data, int32* adjusted_bias_data, + const DepthwiseConvDotProdParams* function_params) { + ProcessPerDepthNeon(filter_data, bias_data, shuffled_filter_data, + adjusted_bias_data, function_params); + } +}; template struct PackMacroBlock +struct KernelMacroBlock { + static inline void KernelMacroBlockNeon( + const int8* scratch_block_data, const int8* filter_workspace, + const int32* bias_data, int8* output_block_data, + const DepthwiseConvDotProdParams* function_params) { + // Note that argument registers may be reused after parameter loading. + // x0 %[scratch_block_data] + // x1 %[filter_workspace] + // x2 %[bias_data] + // x3 %[output_block_data] + // x4 %[function_params] +#define DC_KERNEL_NO_MULT_1 "1" +#define DC_KERNEL_NO_MULT_2 "2" +#define DC_KERNEL_NO_MULT_3 "3" +#define DC_KERNEL_NO_MULT_4 "4" +#define DC_KERNEL_NO_MULT_5 "5" +#define DC_KERNEL_NO_MULT_6 "6" +#define DC_KERNEL_NO_MULT_7 "7" +#define DC_KERNEL_NO_MULT_8 "8" +#define DC_KERNEL_NO_MULT_9 "9" +#define DC_KERNEL_NO_MULT_10 "10" +#define DC_KERNEL_NO_MULT_11 "11" +#define DC_KERNEL_NO_MULT_12 "12" +#define DC_KERNEL_NO_MULT_13 "13" +#define DC_KERNEL_NO_MULT_14 "14" +#define DC_KERNEL_NO_MULT_15 "15" +#define DC_KERNEL_NO_MULT_16 "16" +#define DC_KERNEL_NO_MULT_17 "17" +#define DC_KERNEL_NO_MULT_18 "18" +#define DC_KERNEL_NO_MULT_19 "19" +#define DC_KERNEL_NO_MULT_20 "20" +#define DC_KERNEL_NO_MULT_21 "21" +#define DC_KERNEL_NO_MULT_22 "22" +#define DC_KERNEL_NO_MULT_23 "23" +#define DC_KERNEL_NO_MULT_24 "24" +#define DC_KERNEL_NO_MULT_25 "25" +#define DC_KERNEL_NO_MULT_26 "26" +#define DC_KERNEL_NO_MULT_27 "27" +#define DC_KERNEL_NO_MULT_28 "28" +#define DC_KERNEL_NO_MULT_29 "29" +#define DC_KERNEL_NO_MULT_30 "30" +#define DC_KERNEL_NO_MULT_31 "31" +#define DC_KERNEL_NO_MULT_32 "32" +#define DC_KERNEL_NO_MULT_33 "33" + + asm volatile( + // Compiled code used block of 384 for spill out of total stack of 528. 
+ "sub sp, sp, #384\n" // =528 + "ldr w8, [%[function_params], #" STR(DP_OFFSET_DEPTH_MICRO_REPEATS) "]\n" + "str %[scratch_block_data], [sp, #376]\n" // 8-byte Folded Spill + "cmp w8, #1\n" // =1 + "str x8, [sp, #56]\n" // 8-byte Folded Spill + "b.lt " DC_KERNEL_NO_MULT_33 "f\n" + // %bb.1: + "stp xzr, xzr, [sp, #72]\n" // 16-byte Folded Spill + "ldr w8, [%[function_params], #" STR(DP_OFFSET_OUTPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n" + "str xzr, [sp, #88]\n" // 8-byte Folded Spill + "ldpsw x22, x5, [%[function_params], #" STR(DP_OFFSET_OUTPUT_HEIGHT_STRIDE) "]\n" + "ldr x11, [%[function_params], #" STR(DP_OFFSET_OUTPUT_MULTPLIPLIER_PER_CHANNEL) "]\n" + "str w8, [sp, #340]\n" // 4-byte Folded Spill + "ldr w8, [%[function_params], #" STR(DP_OFFSET_OUTPUT_WIDTH_MICRO_REPEATS) "]\n" + "ldrb w9, [%[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MAX) "]\n" + "str x11, [sp, #40]\n" // 8-byte Folded Spill + "ldr x11, [%[function_params], #" STR(DP_OFFSET_OUTPUT_SHIFT_PER_CHANNEL) "]\n" + "str w8, [sp, #344]\n" // 4-byte Folded Spill + "ldr w8, [%[function_params], #" STR(DP_OFFSET_OUTPUT_RESIDUAL_WIDTH) "]\n" + "ldrsw x7, [%[function_params]]\n" + "str x11, [sp, #32]\n" // 8-byte Folded Spill + "ldrsw x11, [%[function_params], #" STR(DP_OFFSET_INPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n" + "str w8, [sp, #348]\n" // 4-byte Folded Spill + "ldrb w8, [%[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MIN) "]\n" + "ldr x26, [sp, #376]\n" // 8-byte Folded Reload + "mov x23, %[output_block_data]\n" + "add x10, %[function_params], #" STR(DP_OFFSET_OUTPUT_OFFSET) "\n" // =28 + "dup v5.16b, w8\n" + "fmov s3, w8\n" + "lsl x8, x11, #5\n" + "dup v6.16b, w9\n" + "fmov s4, w9\n" + "str x8, [sp, #48]\n" // 8-byte Folded Spill + "add x8, x5, x26\n" + "lsl x9, x7, #1\n" + "ld1r { v0.8h }, [x10]\n" + "add x13, x5, x5, lsl #1\n" + "add x10, x22, x7\n" + "add x28, x8, #32\n" // =32 + "add x8, x23, x9\n" + "str x13, [sp, #312]\n" // 8-byte Folded Spill + "add x13, x13, x26\n" + "str x8, [sp, #360]\n" // 8-byte Folded Spill + "add x8, x23, x10\n" + "str x8, [sp, #352]\n" // 8-byte Folded Spill + "add x8, x13, #32\n" // =32 + "ldr w6, [%[function_params], #" STR(DP_OFFSET_OUTBOUND_BLOCK_HEIGHT) "]\n" + "lsl x12, x5, #2\n" + "add x11, x5, x5, lsl #2\n" + "add x24, x22, x22, lsl #1\n" + "str x8, [sp, #368]\n" // 8-byte Folded Spill + "lsl x8, x5, #1\n" + "mov %[output_block_data], %[filter_workspace]\n" + "lsl %[filter_workspace], x22, #1\n" + "stp x11, x12, [sp, #296]\n" // 16-byte Folded Spill + "add x11, x11, x26\n" + "add x12, x12, x26\n" + "add x14, x9, x7\n" + "add x15, x9, x24\n" + "stp x8, x5, [sp, #320]\n" // 16-byte Folded Spill + "add x8, x8, x26\n" + "add x10, x11, #32\n" // =32 + "add x11, x12, #32\n" // =32 + "add x19, x8, #32\n" // =32 + "add x12, x14, x24\n" + "add x13, x14, %[filter_workspace]\n" + "add x8, x14, x22\n" + "add x25, x23, x14\n" + "add x14, x23, x15\n" + "add x17, x9, x22\n" + "mov %[scratch_block_data], x19\n" + "mov x19, x14\n" + "add x14, x24, x7\n" + "add x21, x23, x17\n" + "mov w17, w6\n" + "add x15, x23, x14\n" + "add x14, %[filter_workspace], x7\n" + "add x6, x23, x12\n" + "add x12, x23, x13\n" + "add %[function_params], x23, x14\n" + "mov x14, x12\n" + "and w12, w17, #0xfffffffe\n" + "str w12, [sp, #20]\n" // 4-byte Folded Spill + "lsl x12, x7, #2\n" + "str x12, [sp, #152]\n" // 8-byte Folded Spill + "add x12, x23, x22\n" + "str x12, [sp, #144]\n" // 8-byte Folded Spill + "add x12, x23, x7\n" + "add x16, x9, %[filter_workspace]\n" + "str x12, [sp, #136]\n" // 8-byte 
Folded Spill + "add x12, x23, %[filter_workspace]\n" + "dup v7.8b, v3.b[0]\n" + "dup v14.8b, v4.b[0]\n" + "add x20, x23, x16\n" + "mov x13, x15\n" + "add x15, x23, x8\n" + "mov x5, %[filter_workspace]\n" + "str x12, [sp, #128]\n" // 8-byte Folded Spill + "mov x8, x24\n" + "add x12, x23, x24\n" + "mov w1, #4\n" + "stp x23, x12, [sp, #112]\n" // 16-byte Folded Spill + "str x26, [sp, #264]\n" // 8-byte Folded Spill + "str x22, [sp, #200]\n" // 8-byte Folded Spill + "str w17, [sp, #108]\n" // 4-byte Folded Spill + "str %[scratch_block_data], [sp, #96]\n" // 8-byte Folded Spill + "str x23, [sp, #24]\n" // 8-byte Folded Spill + "stp d14, d7, [sp, #160]\n" // 16-byte Folded Spill + "b " DC_KERNEL_NO_MULT_4 "f\n" + DC_KERNEL_NO_MULT_2 ":\n" // in Loop: Header=BB111_4 Depth=1 + "mov %[bias_data], x9\n" + DC_KERNEL_NO_MULT_3 ":\n" // in Loop: Header=BB111_4 Depth=1 + "ldr %[output_block_data], [sp, #48]\n" // 8-byte Folded Reload + "ldr x12, [sp, #264]\n" // 8-byte Folded Reload + "ldr x17, [sp, #88]\n" // 8-byte Folded Reload + "add x12, x12, %[output_block_data]\n" + "str x12, [sp, #264]\n" // 8-byte Folded Spill + "ldr x12, [sp, #112]\n" // 8-byte Folded Reload + "add x17, x17, #1\n" // =1 + "add x12, x12, #8\n" // =8 + "str x12, [sp, #112]\n" // 8-byte Folded Spill + "ldr x12, [sp, #72]\n" // 8-byte Folded Reload + "add x12, x12, %[output_block_data]\n" + "str x12, [sp, #72]\n" // 8-byte Folded Spill + "ldp x12, %[output_block_data], [sp, #56]\n" // 16-byte Folded Reload + "cmp x17, x12\n" + "ldr x12, [sp, #80]\n" // 8-byte Folded Reload + "add x12, x12, #8\n" // =8 + "stp x12, x17, [sp, #80]\n" // 16-byte Folded Spill + "ldr w17, [sp, #108]\n" // 4-byte Folded Reload + "b.eq " DC_KERNEL_NO_MULT_33 "f\n" + DC_KERNEL_NO_MULT_4 ":\n" // =>This Loop Header: Depth=1 + // Child Loop BB111_29 Depth 2 + // Child Loop BB111_32 Depth 2 + // Child Loop BB111_20 Depth 2 + // Child Loop BB111_22 Depth 3 + // Child Loop BB111_25 Depth 4 + // Child Loop BB111_7 Depth 2 + // Child Loop BB111_9 Depth 3 + // Child Loop BB111_15 Depth 3 + "ldp q16, q15, [%[output_block_data]]\n" + "ldp q17, q3, [%[output_block_data], #32]\n" + "ldp q18, q4, [%[output_block_data], #64]\n" + "cmp w17, #4\n" // =4 + "add %[output_block_data], x3, #96\n" // =96 + "str %[output_block_data], [sp, #64]\n" // 8-byte Folded Spill + "b.ne " DC_KERNEL_NO_MULT_16 "f\n" + // %bb.5: // in Loop: Header=BB111_4 Depth=1 + "ldp x24, x12, [sp, #80]\n" // 16-byte Folded Reload + "ldr x17, [sp, #32]\n" // 8-byte Folded Reload + "ldr x26, [sp, #72]\n" // 8-byte Folded Reload + "mov x9, xzr\n" + "lsl w12, w12, #3\n" + "lsl x12, x12, #2\n" + "add x16, x17, x12\n" + "ldr x17, [sp, #40]\n" // 8-byte Folded Reload + "stp q4, q3, [sp, #224]\n" // 32-byte Folded Spill + "str q15, [sp, #176]\n" // 16-byte Folded Spill + "add x12, x17, x12\n" + "stp x12, x16, [sp, #208]\n" // 16-byte Folded Spill + "b " DC_KERNEL_NO_MULT_7 "f\n" + DC_KERNEL_NO_MULT_6 ":\n" // in Loop: Header=BB111_7 Depth=2 + "ldp q18, q17, [sp, #224]\n" // 32-byte Folded Reload + "add x9, x9, #1\n" // =1 + "add x26, x26, #16\n" // =16 + "cmp x9, #2\n" // =2 + "add x24, x24, #4\n" // =4 + "mov v16.16b, v15.16b\n" + "b.eq " DC_KERNEL_NO_MULT_3 "b\n" + DC_KERNEL_NO_MULT_7 ":\n" // Parent Loop BB111_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB111_9 Depth 3 + // Child Loop BB111_15 Depth 3 + "ldr q19, [%[bias_data]], #16\n" + "ldr x16, [sp, #264]\n" // 8-byte Folded Reload + "lsl x12, x9, #4\n" + "ldr w17, [sp, #344]\n" // 4-byte Folded Reload + "mov v31.16b, v19.16b\n" + "add 
%[output_block_data], x16, x12\n" + "ldr x16, [sp, #216]\n" // 8-byte Folded Reload + "ldr q22, [%[output_block_data]]\n" + "mov v8.16b, v19.16b\n" + "mov v9.16b, v19.16b\n" + "ldr q20, [x16, x12]\n" + "ldr x16, [sp, #208]\n" // 8-byte Folded Reload + "mov v10.16b, v19.16b\n" + "cmp w17, #1\n" // =1 + "ldr q21, [x16, x12]\n" + "ldr x12, [sp, #328]\n" // 8-byte Folded Reload + "ldr q27, [%[output_block_data], x12]\n" + "ldr x12, [sp, #320]\n" // 8-byte Folded Reload + "ldr q26, [%[output_block_data], x12]\n" + "ldr x12, [sp, #312]\n" // 8-byte Folded Reload + ".word 0x4e9a965f // sdot v31.4s, v18.16b, v26.16b\n" + "ldr q25, [%[output_block_data], x12]\n" + "ldr x12, [sp, #304]\n" // 8-byte Folded Reload + ".word 0x4e9a9628 // sdot v8.4s, v17.16b, v26.16b\n" + ".word 0x4e9a9609 // sdot v9.4s, v16.16b, v26.16b\n" + ".word 0x4e99960a // sdot v10.4s, v16.16b, v25.16b\n" + "ldr q24, [%[output_block_data], x12]\n" + "ldr x12, [sp, #296]\n" // 8-byte Folded Reload + "ldr q23, [%[output_block_data], x12]\n" + "b.lt " DC_KERNEL_NO_MULT_11 "f\n" + // %bb.8: // in Loop: Header=BB111_7 Depth=2 + "stp x24, x9, [sp, #280]\n" // 16-byte Folded Spill + "ldr w12, [sp, #344]\n" // 4-byte Folded Reload + "mov x17, x24\n" + "str x26, [sp, #272]\n" // 8-byte Folded Spill + "mov x22, x26\n" + "ldp x27, x24, [sp, #144]\n" // 16-byte Folded Reload + "ldp x26, %[filter_workspace], [sp, #128]\n" // 16-byte Folded Reload + "ldr x16, [sp, #120]\n" // 8-byte Folded Reload + "shl v28.4s, v16.4s, #8\n" + "shl v29.4s, v17.4s, #8\n" + "shl v30.4s, v18.4s, #8\n" + "mov v11.16b, v23.16b\n" + "mov v12.16b, v24.16b\n" + "mov v13.16b, v27.16b\n" + "mov v14.16b, v22.16b\n" + DC_KERNEL_NO_MULT_9 ":\n" // Parent Loop BB111_4 Depth=1 + // Parent Loop BB111_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ".word 0x4e8e961f // sdot v31.4s, v16.16b, v14.16b\n" + ".word 0x4e8d9608 // sdot v8.4s, v16.16b, v13.16b\n" + ".word 0x4e999629 // sdot v9.4s, v17.16b, v25.16b\n" + ".word 0x4e8d963f // sdot v31.4s, v17.16b, v13.16b\n" + ".word 0x4e8c962a // sdot v10.4s, v17.16b, v12.16b\n" + ".word 0x4e999648 // sdot v8.4s, v18.16b, v25.16b\n" + ".word 0x4e8c9649 // sdot v9.4s, v18.16b, v12.16b\n" + "sqrdmulh v31.4s, v31.4s, v21.4s\n" + ".word 0x4e8b964a // sdot v10.4s, v18.16b, v11.16b\n" + "sqrdmulh v8.4s, v8.4s, v21.4s\n" + "sqrdmulh v9.4s, v9.4s, v21.4s\n" + "sqrshl v31.4s, v31.4s, v20.4s\n" + "sqrdmulh v10.4s, v10.4s, v21.4s\n" + "sqrshl v8.4s, v8.4s, v20.4s\n" + "sqrshl v9.4s, v9.4s, v20.4s\n" + "sqxtn v31.4h, v31.4s\n" + "sqrshl v10.4s, v10.4s, v20.4s\n" + "sqxtn v9.4h, v9.4s\n" + "sqxtn2 v31.8h, v8.4s\n" + "sqxtn2 v9.8h, v10.4s\n" + "sqadd v31.8h, v31.8h, v0.8h\n" + "sqadd v8.8h, v9.8h, v0.8h\n" + "sqxtn v31.8b, v31.8h\n" + "sqxtn2 v31.16b, v8.8h\n" + "smax v31.16b, v31.16b, v5.16b\n" + "add %[output_block_data], x27, x17\n" + "smin v31.16b, v31.16b, v6.16b\n" + "str s31, [x23, x17]\n" + "st1 { v31.s }[1], [%[output_block_data]]\n" + "add %[output_block_data], x26, x17\n" + "st1 { v31.s }[2], [%[output_block_data]]\n" + "add %[output_block_data], x16, x17\n" + "st1 { v31.s }[3], [%[output_block_data]]\n" + "ldr %[output_block_data], [sp, #376]\n" // 8-byte Folded Reload + "mov v10.16b, v19.16b\n" + "mov v31.16b, v19.16b\n" + "mov v8.16b, v19.16b\n" + "ldr x9, [sp, #352]\n" // 8-byte Folded Reload + ".word 0x4e99978a // sdot v10.4s, v28.16b, v25.16b\n" + ".word 0x4e8e979f // sdot v31.4s, v28.16b, v14.16b\n" + ".word 0x4e8d9788 // sdot v8.4s, v28.16b, v13.16b\n" + ".word 0x4e8c97aa // sdot v10.4s, v29.16b, v12.16b\n" + "mov v9.16b, 
v19.16b\n" + ".word 0x4e8d97bf // sdot v31.4s, v29.16b, v13.16b\n" + ".word 0x4e9a97a8 // sdot v8.4s, v29.16b, v26.16b\n" + ".word 0x4e8b97ca // sdot v10.4s, v30.16b, v11.16b\n" + "add %[output_block_data], x3, x22\n" + "rev32 v2.8h, v26.8h\n" + ".word 0x4e9a9789 // sdot v9.4s, v28.16b, v26.16b\n" + ".word 0x4e9a97df // sdot v31.4s, v30.16b, v26.16b\n" + ".word 0x4e9997c8 // sdot v8.4s, v30.16b, v25.16b\n" + "sqrdmulh v26.4s, v10.4s, v21.4s\n" + "rev32 v15.8h, v22.8h\n" + "ldr q22, [%[output_block_data], #32]\n" + "add %[output_block_data], x9, x17\n" + "rev32 v4.8h, v24.8h\n" + ".word 0x4e9997a9 // sdot v9.4s, v29.16b, v25.16b\n" + "sqrdmulh v24.4s, v8.4s, v21.4s\n" + "sqrshl v8.4s, v26.4s, v20.4s\n" + "ldr q26, [%[scratch_block_data], x22]\n" + "mov x9, %[scratch_block_data]\n" + "ldr %[scratch_block_data], [sp, #368]\n" // 8-byte Folded Reload + "mov v7.16b, v6.16b\n" + "mov v6.16b, v5.16b\n" + "rev32 v5.8h, v23.8h\n" + ".word 0x4e8c97c9 // sdot v9.4s, v30.16b, v12.16b\n" + "sqrdmulh v23.4s, v31.4s, v21.4s\n" + "rev32 v3.8h, v25.8h\n" + "sqrdmulh v25.4s, v9.4s, v21.4s\n" + "sqrshl v23.4s, v23.4s, v20.4s\n" + "sqrshl v31.4s, v24.4s, v20.4s\n" + "sqrshl v24.4s, v25.4s, v20.4s\n" + "sqxtn v9.4h, v23.4s\n" + "rev32 v1.8h, v27.8h\n" + "sqxtn v10.4h, v24.4s\n" + "ldr q27, [x28, x22]\n" + "ldr q25, [%[scratch_block_data], x22]\n" + "ldr q24, [x11, x22]\n" + "ldr q23, [x10, x22]\n" + "sqxtn2 v9.8h, v31.4s\n" + "sqxtn2 v10.8h, v8.4s\n" + "sqadd v31.8h, v9.8h, v0.8h\n" + "sqadd v8.8h, v10.8h, v0.8h\n" + "sqxtn v31.8b, v31.8h\n" + "sqxtn2 v31.16b, v8.8h\n" + "smax v31.16b, v31.16b, v6.16b\n" + "smin v31.16b, v31.16b, v7.16b\n" + "str s31, [%[filter_workspace], x17]\n" + "st1 { v31.s }[1], [%[output_block_data]]\n" + "add %[output_block_data], %[function_params], x17\n" + "st1 { v31.s }[2], [%[output_block_data]]\n" + "add %[output_block_data], x13, x17\n" + "mov v8.16b, v19.16b\n" + "st1 { v31.s }[3], [%[output_block_data]]\n" + "trn1 v31.8h, v15.8h, v22.8h\n" + "mov v9.16b, v19.16b\n" + "mov v10.16b, v19.16b\n" + "trn1 v1.8h, v1.8h, v27.8h\n" + "trn1 v2.8h, v2.8h, v26.8h\n" + ".word 0x4e9f9608 // sdot v8.4s, v16.16b, v31.16b\n" + "mov v11.16b, v19.16b\n" + "trn1 v3.8h, v3.8h, v25.8h\n" + ".word 0x4e819609 // sdot v9.4s, v16.16b, v1.16b\n" + ".word 0x4e82960a // sdot v10.4s, v16.16b, v2.16b\n" + ".word 0x4e819628 // sdot v8.4s, v17.16b, v1.16b\n" + "trn1 v4.8h, v4.8h, v24.8h\n" + ".word 0x4e83960b // sdot v11.4s, v16.16b, v3.16b\n" + ".word 0x4e829629 // sdot v9.4s, v17.16b, v2.16b\n" + ".word 0x4e83962a // sdot v10.4s, v17.16b, v3.16b\n" + ".word 0x4e829648 // sdot v8.4s, v18.16b, v2.16b\n" + "trn1 v5.8h, v5.8h, v23.8h\n" + ".word 0x4e84962b // sdot v11.4s, v17.16b, v4.16b\n" + ".word 0x4e839649 // sdot v9.4s, v18.16b, v3.16b\n" + ".word 0x4e84964a // sdot v10.4s, v18.16b, v4.16b\n" + "sqrdmulh v8.4s, v8.4s, v21.4s\n" + ".word 0x4e85964b // sdot v11.4s, v18.16b, v5.16b\n" + "sqrdmulh v9.4s, v9.4s, v21.4s\n" + "sqrdmulh v10.4s, v10.4s, v21.4s\n" + "sqrshl v8.4s, v8.4s, v20.4s\n" + "sqrdmulh v11.4s, v11.4s, v21.4s\n" + "sqrshl v9.4s, v9.4s, v20.4s\n" + "sqrshl v10.4s, v10.4s, v20.4s\n" + "sqxtn v8.4h, v8.4s\n" + "sqrshl v11.4s, v11.4s, v20.4s\n" + "sqxtn v10.4h, v10.4s\n" + "sqxtn2 v8.8h, v9.4s\n" + "sqxtn2 v10.8h, v11.4s\n" + "sqadd v8.8h, v8.8h, v0.8h\n" + "sqadd v9.8h, v10.8h, v0.8h\n" + "sqxtn v8.8b, v8.8h\n" + "sqxtn2 v8.16b, v9.8h\n" + "mov v9.16b, v19.16b\n" + "ldr %[scratch_block_data], [sp, #360]\n" // 8-byte Folded Reload + "mov v10.16b, v19.16b\n" + "mov v11.16b, v19.16b\n" + ".word 
0x4e9f9789 // sdot v9.4s, v28.16b, v31.16b\n" + "mov v12.16b, v19.16b\n" + ".word 0x4e81978a // sdot v10.4s, v28.16b, v1.16b\n" + ".word 0x4e82978b // sdot v11.4s, v28.16b, v2.16b\n" + ".word 0x4e8197a9 // sdot v9.4s, v29.16b, v1.16b\n" + "smax v8.16b, v8.16b, v6.16b\n" + ".word 0x4e83978c // sdot v12.4s, v28.16b, v3.16b\n" + ".word 0x4e8297aa // sdot v10.4s, v29.16b, v2.16b\n" + ".word 0x4e8397ab // sdot v11.4s, v29.16b, v3.16b\n" + ".word 0x4e8297c9 // sdot v9.4s, v30.16b, v2.16b\n" + "add %[output_block_data], x21, x17\n" + "smin v8.16b, v8.16b, v7.16b\n" + ".word 0x4e8497ac // sdot v12.4s, v29.16b, v4.16b\n" + ".word 0x4e8397ca // sdot v10.4s, v30.16b, v3.16b\n" + ".word 0x4e8497cb // sdot v11.4s, v30.16b, v4.16b\n" + "sqrdmulh v1.4s, v9.4s, v21.4s\n" + "str s8, [%[scratch_block_data], x17]\n" + "st1 { v8.s }[1], [%[output_block_data]]\n" + "add %[output_block_data], x20, x17\n" + ".word 0x4e8597cc // sdot v12.4s, v30.16b, v5.16b\n" + "sqrdmulh v2.4s, v10.4s, v21.4s\n" + "sqrdmulh v3.4s, v11.4s, v21.4s\n" + "sqrshl v1.4s, v1.4s, v20.4s\n" + "st1 { v8.s }[2], [%[output_block_data]]\n" + "add %[output_block_data], x19, x17\n" + "sqrdmulh v4.4s, v12.4s, v21.4s\n" + "sqrshl v2.4s, v2.4s, v20.4s\n" + "sqrshl v3.4s, v3.4s, v20.4s\n" + "sqxtn v1.4h, v1.4s\n" + "st1 { v8.s }[3], [%[output_block_data]]\n" + "sqrshl v4.4s, v4.4s, v20.4s\n" + "sqxtn v3.4h, v3.4s\n" + "sqxtn2 v1.8h, v2.4s\n" + "sqxtn2 v3.8h, v4.4s\n" + "sqadd v1.8h, v1.8h, v0.8h\n" + "sqadd v2.8h, v3.8h, v0.8h\n" + "sqxtn v1.8b, v1.8h\n" + "mov v5.16b, v6.16b\n" + "sqxtn2 v1.16b, v2.8h\n" + "smax v1.16b, v1.16b, v5.16b\n" + "add %[output_block_data], x15, x17\n" + "smin v1.16b, v1.16b, v7.16b\n" + "str s1, [x25, x17]\n" + "st1 { v1.s }[1], [%[output_block_data]]\n" + "add %[output_block_data], x14, x17\n" + "mov v31.16b, v19.16b\n" + "mov v8.16b, v19.16b\n" + "mov v9.16b, v19.16b\n" + "mov v10.16b, v19.16b\n" + "mov %[scratch_block_data], x9\n" + "mov v6.16b, v7.16b\n" + "st1 { v1.s }[2], [%[output_block_data]]\n" + "add %[output_block_data], x6, x17\n" + "subs w12, w12, #1\n" // =1 + "add x22, x22, #32\n" // =32 + ".word 0x4e9a965f // sdot v31.4s, v18.16b, v26.16b\n" + ".word 0x4e9a9628 // sdot v8.4s, v17.16b, v26.16b\n" + ".word 0x4e9a9609 // sdot v9.4s, v16.16b, v26.16b\n" + ".word 0x4e99960a // sdot v10.4s, v16.16b, v25.16b\n" + "add x17, x17, x24\n" + "mov v11.16b, v23.16b\n" + "mov v12.16b, v24.16b\n" + "mov v13.16b, v27.16b\n" + "mov v14.16b, v22.16b\n" + "st1 { v1.s }[3], [%[output_block_data]]\n" + "b.ne " DC_KERNEL_NO_MULT_9 "b\n" + // %bb.10: // in Loop: Header=BB111_7 Depth=2 + "ldr x12, [sp, #376]\n" // 8-byte Folded Reload + "ldp d14, d7, [sp, #160]\n" // 16-byte Folded Reload + "ldr q15, [sp, #176]\n" // 16-byte Folded Reload + "ldp x24, x9, [sp, #280]\n" // 16-byte Folded Reload + "add %[output_block_data], x12, x22\n" + "ldr x22, [sp, #200]\n" // 8-byte Folded Reload + "ldr x26, [sp, #272]\n" // 8-byte Folded Reload + "add x12, x23, x17\n" + "mov w1, #4\n" + "ldr w17, [sp, #348]\n" // 4-byte Folded Reload + "cmp w17, #0\n" // =0 + "b.gt " DC_KERNEL_NO_MULT_12 "f\n" + "b " DC_KERNEL_NO_MULT_6 "b\n" + DC_KERNEL_NO_MULT_11 ":\n" // in Loop: Header=BB111_7 Depth=2 + "ldr x12, [sp, #112]\n" // 8-byte Folded Reload + "add x12, x12, x9, lsl #2\n" + "ldr w17, [sp, #348]\n" // 4-byte Folded Reload + "cmp w17, #0\n" // =0 + "b.le " DC_KERNEL_NO_MULT_6 "b\n" + DC_KERNEL_NO_MULT_12 ":\n" // in Loop: Header=BB111_7 Depth=2 + "ldr w17, [sp, #348]\n" // 4-byte Folded Reload + "movi v28.16b, #0\n" + "movi v29.16b, #0\n" + "movi 
v30.16b, #0\n" + "cmp w17, #3\n" // =3 + "movi v11.16b, #0\n" + "movi v12.16b, #0\n" + "movi v13.16b, #0\n" + "b.lt " DC_KERNEL_NO_MULT_14 "f\n" + // %bb.13: // in Loop: Header=BB111_7 Depth=2 + "add x17, %[output_block_data], #32\n" // =32 + "ldp x16, %[output_block_data], [sp, #320]\n" // 16-byte Folded Reload + "ldr q13, [x17]\n" + "ldr %[scratch_block_data], [sp, #96]\n" // 8-byte Folded Reload + "ldr q12, [x17, %[output_block_data]]\n" + "ldr %[output_block_data], [sp, #312]\n" // 8-byte Folded Reload + "ldr q11, [x17, x16]\n" + "ldr q30, [x17, %[output_block_data]]\n" + "ldr %[output_block_data], [sp, #304]\n" // 8-byte Folded Reload + "ldr q29, [x17, %[output_block_data]]\n" + "ldr %[output_block_data], [sp, #296]\n" // 8-byte Folded Reload + "ldr q28, [x17, %[output_block_data]]\n" + DC_KERNEL_NO_MULT_14 ":\n" // in Loop: Header=BB111_7 Depth=2 + "ldr w17, [sp, #348]\n" // 4-byte Folded Reload + DC_KERNEL_NO_MULT_15 ":\n" // Parent Loop BB111_4 Depth=1 + // Parent Loop BB111_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ".word 0x4e96961f // sdot v31.4s, v16.16b, v22.16b\n" + ".word 0x4e9b9608 // sdot v8.4s, v16.16b, v27.16b\n" + ".word 0x4e999629 // sdot v9.4s, v17.16b, v25.16b\n" + ".word 0x4e9b963f // sdot v31.4s, v17.16b, v27.16b\n" + ".word 0x4e98962a // sdot v10.4s, v17.16b, v24.16b\n" + ".word 0x4e999648 // sdot v8.4s, v18.16b, v25.16b\n" + ".word 0x4e989649 // sdot v9.4s, v18.16b, v24.16b\n" + "sqrdmulh v1.4s, v31.4s, v21.4s\n" + ".word 0x4e97964a // sdot v10.4s, v18.16b, v23.16b\n" + "sqrdmulh v2.4s, v8.4s, v21.4s\n" + "sqrdmulh v3.4s, v9.4s, v21.4s\n" + "sqrshl v1.4s, v1.4s, v20.4s\n" + "sqrdmulh v4.4s, v10.4s, v21.4s\n" + "sqrshl v2.4s, v2.4s, v20.4s\n" + "sqrshl v3.4s, v3.4s, v20.4s\n" + "sqxtn v1.4h, v1.4s\n" + "sqrshl v4.4s, v4.4s, v20.4s\n" + "sqxtn v3.4h, v3.4s\n" + "sqxtn2 v1.8h, v2.4s\n" + "sqxtn2 v3.8h, v4.4s\n" + "sqadd v1.8h, v1.8h, v0.8h\n" + "sqadd v2.8h, v3.8h, v0.8h\n" + "sqxtn v1.8b, v1.8h\n" + "sqxtn2 v1.16b, v2.8h\n" + "smax v1.16b, v1.16b, v5.16b\n" + "add %[output_block_data], x12, x22\n" + "smin v1.16b, v1.16b, v6.16b\n" + "ushr v26.4s, v26.4s, #8\n" + "ushr v25.4s, v25.4s, #8\n" + "str s1, [x12]\n" + "st1 { v1.s }[1], [%[output_block_data]]\n" + "add %[output_block_data], x12, x5\n" + "ushr v22.4s, v22.4s, #8\n" + "ushr v27.4s, v27.4s, #8\n" + "sli v26.4s, v11.4s, #24\n" + "ushr v24.4s, v24.4s, #8\n" + "ushr v23.4s, v23.4s, #8\n" + "sli v25.4s, v30.4s, #24\n" + "mov v31.16b, v19.16b\n" + "mov v8.16b, v19.16b\n" + "mov v9.16b, v19.16b\n" + "mov v10.16b, v19.16b\n" + "st1 { v1.s }[2], [%[output_block_data]]\n" + "add %[output_block_data], x12, x8\n" + "subs w17, w17, #1\n" // =1 + "sli v22.4s, v13.4s, #24\n" + "ushr v13.4s, v13.4s, #8\n" + "ushr v11.4s, v11.4s, #8\n" + "sli v27.4s, v12.4s, #24\n" + "ushr v12.4s, v12.4s, #8\n" + "ushr v30.4s, v30.4s, #8\n" + "sli v24.4s, v29.4s, #24\n" + "ushr v29.4s, v29.4s, #8\n" + "sli v23.4s, v28.4s, #24\n" + "ushr v28.4s, v28.4s, #8\n" + ".word 0x4e9a965f // sdot v31.4s, v18.16b, v26.16b\n" + ".word 0x4e9a9628 // sdot v8.4s, v17.16b, v26.16b\n" + ".word 0x4e9a9609 // sdot v9.4s, v16.16b, v26.16b\n" + "add x12, x12, x7\n" + ".word 0x4e99960a // sdot v10.4s, v16.16b, v25.16b\n" + "st1 { v1.s }[3], [%[output_block_data]]\n" + "b.ne " DC_KERNEL_NO_MULT_15 "b\n" + "b " DC_KERNEL_NO_MULT_6 "b\n" + DC_KERNEL_NO_MULT_16 ":\n" // in Loop: Header=BB111_4 Depth=1 + "cmp w17, #1\n" // =1 + "add x9, %[bias_data], #32\n" // =32 + "b.lt " DC_KERNEL_NO_MULT_2 "b\n" + // %bb.17: // in Loop: Header=BB111_4 Depth=1 + "ldr w12, 
[sp, #340]\n" // 4-byte Folded Reload + "cmp w12, #1\n" // =1 + "b.lt " DC_KERNEL_NO_MULT_27 "f\n" + // %bb.18: // in Loop: Header=BB111_4 Depth=1 + "ldr x12, [sp, #88]\n" // 8-byte Folded Reload + "ldp x17, %[output_block_data], [sp, #32]\n" // 16-byte Folded Reload + "str x9, [sp, #288]\n" // 8-byte Folded Spill + "ldp q19, q20, [%[bias_data]]\n" + "lsl w12, w12, #3\n" + "lsl x12, x12, #2\n" + "add x17, x17, x12\n" + "add x12, %[output_block_data], x12\n" + "ldp q21, q22, [x17]\n" + "ldp q23, q24, [x12]\n" + "ldr x9, [sp, #264]\n" // 8-byte Folded Reload + "ldr x27, [sp, #112]\n" // 8-byte Folded Reload + "mov w26, wzr\n" + "b " DC_KERNEL_NO_MULT_20 "f\n" + DC_KERNEL_NO_MULT_19 ":\n" // in Loop: Header=BB111_20 Depth=2 + "ldr w12, [sp, #108]\n" // 4-byte Folded Reload + "ldr x22, [sp, #200]\n" // 8-byte Folded Reload + "add w26, w26, #1\n" // =1 + "cmp w26, w12\n" + "add x27, x27, x22\n" + "b.eq " DC_KERNEL_NO_MULT_26 "f\n" + DC_KERNEL_NO_MULT_20 ":\n" // Parent Loop BB111_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB111_22 Depth 3 + // Child Loop BB111_25 Depth 4 + "ldp x16, %[output_block_data], [sp, #320]\n" // 16-byte Folded Reload + "ldp q25, q26, [x9]\n" + "mov w12, wzr\n" + "mov x17, x9\n" + "add %[scratch_block_data], x9, %[output_block_data]\n" + "add %[output_block_data], x9, x16\n" + "ldp q27, q28, [%[scratch_block_data]]\n" + "ldp q29, q30, [%[output_block_data]]\n" + "mov x9, %[scratch_block_data]\n" + "mov x22, x27\n" + "b " DC_KERNEL_NO_MULT_22 "f\n" + DC_KERNEL_NO_MULT_21 ":\n" // in Loop: Header=BB111_22 Depth=3 + "ldr w16, [sp, #340]\n" // 4-byte Folded Reload + "add w12, w12, #1\n" // =1 + "mov x17, %[scratch_block_data]\n" + "cmp w12, w16\n" + "b.eq " DC_KERNEL_NO_MULT_19 "b\n" + DC_KERNEL_NO_MULT_22 ":\n" // Parent Loop BB111_4 Depth=1 + // Parent Loop BB111_20 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB111_25 Depth 4 + "ldr w16, [sp, #344]\n" // 4-byte Folded Reload + "add %[scratch_block_data], x17, #32\n" // =32 + "cmp w12, w16\n" + "ldr w16, [sp, #348]\n" // 4-byte Folded Reload + "csel w3, w16, w1, eq\n" + "cmp w3, #3\n" // =3 + "b.ge " DC_KERNEL_NO_MULT_24 "f\n" + // %bb.23: // in Loop: Header=BB111_22 Depth=3 + "movi v31.16b, #0\n" + "cmp w3, #1\n" // =1 + "movi v8.16b, #0\n" + "movi v9.16b, #0\n" + "movi v11.16b, #0\n" + "movi v12.16b, #0\n" + "movi v10.16b, #0\n" + "b.ge " DC_KERNEL_NO_MULT_25 "f\n" + "b " DC_KERNEL_NO_MULT_21 "b\n" + DC_KERNEL_NO_MULT_24 ":\n" // in Loop: Header=BB111_22 Depth=3 + "ldr x24, [sp, #328]\n" // 8-byte Folded Reload + "mov x16, x11\n" + "mov x11, x10\n" + "mov x10, %[scratch_block_data]\n" + "add x24, %[scratch_block_data], x24\n" + "ldr %[scratch_block_data], [sp, #320]\n" // 8-byte Folded Reload + "ldp q10, q9, [x17, #32]\n" + "ldp q12, q8, [x24]\n" + "mov x23, x15\n" + "add %[scratch_block_data], x10, x0\n" + "ldp q11, q31, [%[scratch_block_data]]\n" + "mov x15, x14\n" + "mov x14, x6\n" + "mov %[bias_data], x13\n" + "mov x13, x21\n" + "mov x21, x20\n" + "mov x20, x19\n" + "mov x19, x25\n" + "mov x19, x20\n" + "mov x20, x21\n" + "mov x21, x13\n" + "mov x13, %[bias_data]\n" + "mov x14, x15\n" + "mov x15, x23\n" + "mov %[scratch_block_data], x10\n" + "mov x10, x11\n" + "mov x11, x16\n" + DC_KERNEL_NO_MULT_25 ":\n" // Parent Loop BB111_4 Depth=1 + // Parent Loop BB111_20 Depth=2 + // Parent Loop BB111_22 Depth=3 + // => This Inner Loop Header: Depth=4 + "mov v1.16b, v19.16b\n" + "mov v2.16b, v20.16b\n" + ".word 0x4e999601 // sdot v1.4s, v16.16b, v25.16b\n" + ".word 0x4e9a95e2 // sdot v2.4s, 
v15.16b, v26.16b\n" + ".word 0x4e9b9621 // sdot v1.4s, v17.16b, v27.16b\n" + ".word 0x4e9c9462 // sdot v2.4s, v3.16b, v28.16b\n" + ".word 0x4e9d9641 // sdot v1.4s, v18.16b, v29.16b\n" + ".word 0x4e9e9482 // sdot v2.4s, v4.16b, v30.16b\n" + "sqrdmulh v1.4s, v1.4s, v23.4s\n" + "sqrdmulh v2.4s, v2.4s, v24.4s\n" + "sqrshl v1.4s, v1.4s, v21.4s\n" + "sqrshl v2.4s, v2.4s, v22.4s\n" + "sqxtn v1.4h, v1.4s\n" + "sqxtn2 v1.8h, v2.4s\n" + "sqadd v1.8h, v1.8h, v0.8h\n" + "sqxtn v1.8b, v1.8h\n" + "smax v1.8b, v1.8b, v7.8b\n" + "ushr v25.4s, v25.4s, #8\n" + "ushr v26.4s, v26.4s, #8\n" + "ushr v27.4s, v27.4s, #8\n" + "ushr v28.4s, v28.4s, #8\n" + "ushr v29.4s, v29.4s, #8\n" + "ushr v30.4s, v30.4s, #8\n" + "smin v1.8b, v1.8b, v14.8b\n" + "subs w3, w3, #1\n" // =1 + "sli v25.4s, v10.4s, #24\n" + "ushr v10.4s, v10.4s, #8\n" + "sli v26.4s, v9.4s, #24\n" + "ushr v9.4s, v9.4s, #8\n" + "sli v27.4s, v12.4s, #24\n" + "ushr v12.4s, v12.4s, #8\n" + "sli v28.4s, v8.4s, #24\n" + "ushr v8.4s, v8.4s, #8\n" + "sli v29.4s, v11.4s, #24\n" + "ushr v11.4s, v11.4s, #8\n" + "sli v30.4s, v31.4s, #24\n" + "ushr v31.4s, v31.4s, #8\n" + "str d1, [x22]\n" + "add x22, x22, x7\n" + "b.ne " DC_KERNEL_NO_MULT_25 "b\n" + "b " DC_KERNEL_NO_MULT_21 "b\n" + DC_KERNEL_NO_MULT_26 ":\n" // in Loop: Header=BB111_4 Depth=1 + "ldr %[bias_data], [sp, #288]\n" // 8-byte Folded Reload + "ldr x23, [sp, #24]\n" // 8-byte Folded Reload + "ldr %[scratch_block_data], [sp, #96]\n" // 8-byte Folded Reload + "b " DC_KERNEL_NO_MULT_3 "b\n" + DC_KERNEL_NO_MULT_27 ":\n" // in Loop: Header=BB111_4 Depth=1 + "ldr w12, [sp, #20]\n" // 4-byte Folded Reload + "cmp w17, #2\n" // =2 + "b.hs " DC_KERNEL_NO_MULT_29 "f\n" + // %bb.28: // in Loop: Header=BB111_4 Depth=1 + "mov w12, wzr\n" + "b " DC_KERNEL_NO_MULT_31 "f\n" + DC_KERNEL_NO_MULT_29 ":\n" // Parent Loop BB111_4 Depth=1 + // => This Inner Loop Header: Depth=2 + "subs w12, w12, #2\n" // =2 + "b.ne " DC_KERNEL_NO_MULT_29 "b\n" + // %bb.30: // in Loop: Header=BB111_4 Depth=1 + "ldr w12, [sp, #20]\n" // 4-byte Folded Reload + "cmp w17, w12\n" + "b.eq " DC_KERNEL_NO_MULT_2 "b\n" + DC_KERNEL_NO_MULT_31 ":\n" // in Loop: Header=BB111_4 Depth=1 + "sub w12, w17, w12\n" + DC_KERNEL_NO_MULT_32 ":\n" // Parent Loop BB111_4 Depth=1 + // => This Inner Loop Header: Depth=2 + "subs w12, w12, #1\n" // =1 + "b.ne " DC_KERNEL_NO_MULT_32 "b\n" + "b " DC_KERNEL_NO_MULT_2 "b\n" + DC_KERNEL_NO_MULT_33 ":\n" + // Compiled intrinsics total stack 528, now 384 for spillage only. + "add sp, sp, #384\n" // =528 + : + // Outputs. + [ scratch_block_data ] "+r"(scratch_block_data), + [ filter_workspace ] "+r"(filter_workspace), + [ bias_data ] "+r"(bias_data), + [ output_block_data ] "+r"(output_block_data) + : + // Inputs. + [ function_params ] "r"(function_params) + : + // Clobbers. + "cc", "memory", + // We use these NEON registers. + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31", + // We use these general-purpose registers. 
+ "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", + "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", + "x27", "x28"); +#undef DC_KERNEL_NO_MULT_1 +#undef DC_KERNEL_NO_MULT_2 +#undef DC_KERNEL_NO_MULT_3 +#undef DC_KERNEL_NO_MULT_4 +#undef DC_KERNEL_NO_MULT_5 +#undef DC_KERNEL_NO_MULT_6 +#undef DC_KERNEL_NO_MULT_7 +#undef DC_KERNEL_NO_MULT_8 +#undef DC_KERNEL_NO_MULT_9 +#undef DC_KERNEL_NO_MULT_10 +#undef DC_KERNEL_NO_MULT_11 +#undef DC_KERNEL_NO_MULT_12 +#undef DC_KERNEL_NO_MULT_13 +#undef DC_KERNEL_NO_MULT_14 +#undef DC_KERNEL_NO_MULT_15 +#undef DC_KERNEL_NO_MULT_16 +#undef DC_KERNEL_NO_MULT_17 +#undef DC_KERNEL_NO_MULT_18 +#undef DC_KERNEL_NO_MULT_19 +#undef DC_KERNEL_NO_MULT_20 +#undef DC_KERNEL_NO_MULT_21 +#undef DC_KERNEL_NO_MULT_22 +#undef DC_KERNEL_NO_MULT_23 +#undef DC_KERNEL_NO_MULT_24 +#undef DC_KERNEL_NO_MULT_25 +#undef DC_KERNEL_NO_MULT_26 +#undef DC_KERNEL_NO_MULT_27 +#undef DC_KERNEL_NO_MULT_28 +#undef DC_KERNEL_NO_MULT_29 +#undef DC_KERNEL_NO_MULT_30 +#undef DC_KERNEL_NO_MULT_31 +#undef DC_KERNEL_NO_MULT_32 +#undef DC_KERNEL_NO_MULT_33 + } // NOLINT(readability/fn_size) Manually unrolled. + + static inline void Run(const int8* scratch_block_data, + const int8* filter_workspace, const int32* bias_data, + int8* output_block_data, + const DepthwiseConvDotProdParams* function_params) { + KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data, + output_block_data, function_params); + } +}; + +template <> +struct KernelMacroBlock { + static inline void KernelMacroBlockNeon( + const int8* scratch_block_data, const int8* filter_workspace, + const int32* bias_data, int8* output_block_data, + const DepthwiseConvDotProdParams* function_params) { + // Note that argument registers may be reused after parameter loading. + // x0 %[scratch_block_data] + // x1 %[filter_workspace] + // x2 %[bias_data] + // x3 %[output_block_data] + // x4 %[function_params] +#define DC_KERNEL_NO_MULT_STRIDE_1 "1" +#define DC_KERNEL_NO_MULT_STRIDE_2 "2" +#define DC_KERNEL_NO_MULT_STRIDE_3 "3" +#define DC_KERNEL_NO_MULT_STRIDE_4 "4" +#define DC_KERNEL_NO_MULT_STRIDE_5 "5" +#define DC_KERNEL_NO_MULT_STRIDE_6 "6" +#define DC_KERNEL_NO_MULT_STRIDE_7 "7" +#define DC_KERNEL_NO_MULT_STRIDE_8 "8" +#define DC_KERNEL_NO_MULT_STRIDE_9 "9" +#define DC_KERNEL_NO_MULT_STRIDE_10 "10" +#define DC_KERNEL_NO_MULT_STRIDE_11 "11" +#define DC_KERNEL_NO_MULT_STRIDE_12 "12" +#define DC_KERNEL_NO_MULT_STRIDE_13 "13" +#define DC_KERNEL_NO_MULT_STRIDE_14 "14" +#define DC_KERNEL_NO_MULT_STRIDE_15 "15" +#define DC_KERNEL_NO_MULT_STRIDE_16 "16" +#define DC_KERNEL_NO_MULT_STRIDE_17 "17" +#define DC_KERNEL_NO_MULT_STRIDE_18 "18" +#define DC_KERNEL_NO_MULT_STRIDE_19 "19" +#define DC_KERNEL_NO_MULT_STRIDE_20 "20" +#define DC_KERNEL_NO_MULT_STRIDE_21 "21" +#define DC_KERNEL_NO_MULT_STRIDE_22 "22" +#define DC_KERNEL_NO_MULT_STRIDE_23 "23" +#define DC_KERNEL_NO_MULT_STRIDE_24 "24" +#define DC_KERNEL_NO_MULT_STRIDE_25 "25" +#define DC_KERNEL_NO_MULT_STRIDE_26 "26" +#define DC_KERNEL_NO_MULT_STRIDE_27 "27" +#define DC_KERNEL_NO_MULT_STRIDE_28 "28" +#define DC_KERNEL_NO_MULT_STRIDE_29 "29" +#define DC_KERNEL_NO_MULT_STRIDE_30 "30" +#define DC_KERNEL_NO_MULT_STRIDE_31 "31" +#define DC_KERNEL_NO_MULT_STRIDE_32 "32" +#define DC_KERNEL_NO_MULT_STRIDE_33 "33" +#define DC_KERNEL_NO_MULT_STRIDE_34 "34" +#define DC_KERNEL_NO_MULT_STRIDE_35 "35" + + asm volatile( + // Compiled code used block of 176 for spill out of total stack of 320. 
+ "sub sp, sp, #176\n" // =320 + + + "ldr w23, [%[function_params], #" STR(DP_OFFSET_DEPTH_MICRO_REPEATS) "]\n" + "str %[scratch_block_data], [sp, #168]\n" // 8-byte Folded Spill + "cmp w23, #1\n" // =1 + "b.lt " DC_KERNEL_NO_MULT_STRIDE_35 "f\n" + // %bb.1: + "ldr x8, [%[function_params], #" STR(DP_OFFSET_OUTPUT_MULTPLIPLIER_PER_CHANNEL) "]\n" + "ldpsw x11, x12, [%[function_params], #" STR(DP_OFFSET_OUTPUT_HEIGHT_STRIDE) "]\n" + "ldp w13, w0, [%[function_params], #" STR(DP_OFFSET_OUTPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n" + "ldr w5, [%[function_params], #" STR(DP_OFFSET_OUTBOUND_BLOCK_HEIGHT) "]\n" + "str x8, [sp, #144]\n" // 8-byte Folded Spill + "ldr x8, [%[function_params], #" STR(DP_OFFSET_OUTPUT_SHIFT_PER_CHANNEL) "]\n" + "ldr x14, [%[function_params]]\n" + "str w5, [sp, #164]\n" // 4-byte Folded Spill + "add x15, %[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MIN) "\n" // =40 + "str x8, [sp, #136]\n" // 8-byte Folded Spill + "add x16, %[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MAX) "\n" // =44 + "add x17, %[function_params], #" STR(DP_OFFSET_OUTPUT_OFFSET) "\n" // =28 + "ldrsw x8, [%[function_params], #" STR(DP_OFFSET_INPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n" + "ldp w5, w4, [%[function_params], #" STR(DP_OFFSET_OUTPUT_RESIDUAL_WIDTH) "]\n" + "ld1r { v0.8h }, [x17]\n" + "ld1r { v1.8b }, [x15]\n" + "ld1r { v2.8b }, [x16]\n" + "cmp w5, #1\n" // =1 + "ccmp w0, w13, #0, eq\n" + "lsl w15, w14, #1\n" + "csel w6, w0, w13, lt\n" + "lsl x8, x8, #5\n" + "sxtw x19, w14\n" + "sxtw x22, w15\n" + "bic w14, w6, w6, asr #31\n" + "str x8, [sp, #152]\n" // 8-byte Folded Spill + "lsl x7, x12, #1\n" + "madd x8, x22, x14, %[output_block_data]\n" + "mov x9, xzr\n" + "mov x10, xzr\n" + "lsl x20, x12, #2\n" + "add x21, x7, x12\n" + "sub x14, x13, x14\n" + "stp x8, x23, [sp, #48]\n" // 16-byte Folded Spill + "add x8, x8, #4\n" // =4 + "str w4, [sp, #44]\n" // 4-byte Folded Spill + "str %[scratch_block_data], [sp, #32]\n" // 8-byte Folded Spill + "str x14, [sp, #128]\n" // 8-byte Folded Spill + "str x8, [sp, #8]\n" // 8-byte Folded Spill + // implicit-def: $q5 + // implicit-def: $q21 + // implicit-def: $q19 + // implicit-def: $q16 + // implicit-def: $q20 + // implicit-def: $q3 + // implicit-def: $q11 + // implicit-def: $q13 + // implicit-def: $q14 + // implicit-def: $q15 + // implicit-def: $q6 + "b " DC_KERNEL_NO_MULT_STRIDE_4 "f\n" + DC_KERNEL_NO_MULT_STRIDE_2 ":\n" // in Loop: Header=BB112_4 Depth=1 + "add x27, %[bias_data], #32\n" // =32 + "mov v19.16b, v12.16b\n" + "mov v3.16b, v9.16b\n" + "mov v5.16b, v10.16b\n" + "mov v20.16b, v7.16b\n" + DC_KERNEL_NO_MULT_STRIDE_3 ":\n" // in Loop: Header=BB112_4 Depth=1 + "add x10, x10, #1\n" // =1 + "cmp x10, x23\n" + "add x9, x9, #8\n" // =8 + "mov %[bias_data], x27\n" + "b.eq " DC_KERNEL_NO_MULT_STRIDE_35 "f\n" + DC_KERNEL_NO_MULT_STRIDE_4 ":\n" // =>This Loop Header: Depth=1 + // Child Loop BB112_30 Depth 2 + // Child Loop BB112_21 Depth 2 + // Child Loop BB112_7 Depth 2 + // Child Loop BB112_9 Depth 2 + // Child Loop BB112_12 Depth 2 + // Child Loop BB112_26 Depth 2 + "ldr w8, [sp, #164]\n" // 4-byte Folded Reload + "add w14, w10, w10, lsl #1\n" + "lsl w14, w14, #5\n" + "add x26, %[filter_workspace], x14\n" + "cmp w8, #2\n" // =2 + "ldr x8, [sp, #168]\n" // 8-byte Folded Reload + "ldr x14, [sp, #152]\n" // 8-byte Folded Reload + "nop\n" + "madd x28, x10, x14, x8\n" + "b.ne " DC_KERNEL_NO_MULT_STRIDE_14 "f\n" + // %bb.5: // in Loop: Header=BB112_4 Depth=1 + "ldr x8, [sp, #136]\n" // 8-byte Folded Reload + "ubfx x14, x9, #3, #29\n" + "lsl 
w15, w10, #3\n" + "lsl x27, x14, #3\n" + "lsl x14, x15, #2\n" + "add x24, x8, x14\n" + "ldr x8, [sp, #144]\n" // 8-byte Folded Reload + "ldr q22, [x26]\n" + "ldr q23, [x26, #32]\n" + "ldr q24, [x26, #64]\n" + "add x14, x8, x14\n" + "ldr x8, [sp, #48]\n" // 8-byte Folded Reload + "ldr q25, [%[bias_data]]\n" + "ldr q31, [x28]\n" + "ldr q8, [x28, x12]\n" + "ldr q30, [x28, x7]\n" + "ldr q29, [x28, x21]\n" + "ldr q26, [x24]\n" + "ldr q27, [x14]\n" + "ldr q28, [x28, x20]\n" + "add x25, x8, x27\n" + "cmp w6, #1\n" // =1 + "add %[function_params], %[output_block_data], x15\n" + "mov v12.16b, v19.16b\n" + "mov v7.16b, v20.16b\n" + "b.lt " DC_KERNEL_NO_MULT_STRIDE_23 "f\n" + // %bb.6: // in Loop: Header=BB112_4 Depth=1 + "mov v4.16b, v21.16b\n" + "mov x8, %[filter_workspace]\n" + "mov w15, wzr\n" + "mov x16, xzr\n" + "add x17, x28, #32\n" // =32 + "mov x23, x6\n" + "mov v17.16b, v30.16b\n" + DC_KERNEL_NO_MULT_STRIDE_7 ":\n" // Parent Loop BB112_4 Depth=1 + // => This Inner Loop Header: Depth=2 + "mov v18.16b, v25.16b\n" + "mov v19.16b, v25.16b\n" + ".word 0x4e9f96d2 // sdot v18.4s, v22.16b, v31.16b\n" + ".word 0x4e9196d3 // sdot v19.4s, v22.16b, v17.16b\n" + ".word 0x4e8896f2 // sdot v18.4s, v23.16b, v8.16b\n" + ".word 0x4e9d96f3 // sdot v19.4s, v23.16b, v29.16b\n" + ".word 0x4e919712 // sdot v18.4s, v24.16b, v17.16b\n" + ".word 0x4e9c9713 // sdot v19.4s, v24.16b, v28.16b\n" + "sqrdmulh v18.4s, v18.4s, v27.4s\n" + "and %[scratch_block_data], x16, #0xffffffe0\n" + "sqrdmulh v19.4s, v19.4s, v27.4s\n" + "sqrshl v18.4s, v18.4s, v26.4s\n" + "add %[scratch_block_data], x17, x0\n" + "sqrshl v19.4s, v19.4s, v26.4s\n" + "sqxtn v18.4h, v18.4s\n" + "rev32 v20.8h, v31.8h\n" + "rev32 v21.8h, v8.8h\n" + "rev32 v9.8h, v30.8h\n" + "rev32 v10.8h, v29.8h\n" + "ldr q31, [%[scratch_block_data]]\n" + "ldr q8, [%[scratch_block_data], x12]\n" + "ldr q30, [%[scratch_block_data], x7]\n" + "ldr q29, [%[scratch_block_data], x21]\n" + "rev32 v17.8h, v28.8h\n" + "ldr q28, [%[scratch_block_data], x20]\n" + "sqxtn2 v18.8h, v19.4s\n" + "sqadd v18.8h, v18.8h, v0.8h\n" + "sqxtn v18.8b, v18.8h\n" + "add %[filter_workspace], %[function_params], w15, sxtw\n" + "smax v18.8b, v18.8b, v1.8b\n" + "add %[scratch_block_data], %[filter_workspace], x11\n" + "smin v18.8b, v18.8b, v2.8b\n" + "mov v11.16b, v25.16b\n" + "str s18, [%[filter_workspace]]\n" + "st1 { v18.s }[1], [%[scratch_block_data]]\n" + "trn1 v18.8h, v20.8h, v31.8h\n" + "mov v19.16b, v25.16b\n" + "trn1 v20.8h, v21.8h, v8.8h\n" + "trn1 v21.8h, v9.8h, v30.8h\n" + ".word 0x4e9296cb // sdot v11.4s, v22.16b, v18.16b\n" + "trn1 v9.8h, v10.8h, v29.8h\n" + ".word 0x4e9596d3 // sdot v19.4s, v22.16b, v21.16b\n" + ".word 0x4e9496eb // sdot v11.4s, v23.16b, v20.16b\n" + "trn1 v17.8h, v17.8h, v28.8h\n" + ".word 0x4e8996f3 // sdot v19.4s, v23.16b, v9.16b\n" + ".word 0x4e95970b // sdot v11.4s, v24.16b, v21.16b\n" + ".word 0x4e919713 // sdot v19.4s, v24.16b, v17.16b\n" + "sqrdmulh v17.4s, v11.4s, v27.4s\n" + "sqrdmulh v18.4s, v19.4s, v27.4s\n" + "sqrshl v17.4s, v17.4s, v26.4s\n" + "sqrshl v18.4s, v18.4s, v26.4s\n" + "sqxtn v17.4h, v17.4s\n" + "sqxtn2 v17.8h, v18.4s\n" + "sqadd v17.8h, v17.8h, v0.8h\n" + "sqxtn v17.8b, v17.8h\n" + "add %[filter_workspace], x1, x19\n" + "smax v17.8b, v17.8b, v1.8b\n" + "add %[scratch_block_data], %[filter_workspace], x11\n" + "smin v17.8b, v17.8b, v2.8b\n" + "add x16, x16, #32\n" // =32 + "subs x23, x23, #1\n" // =1 + "str s17, [%[filter_workspace]]\n" + "st1 { v17.s }[1], [%[scratch_block_data]]\n" + "add w15, w15, w22\n" + "mov v17.16b, v30.16b\n" + "b.ne " 
DC_KERNEL_NO_MULT_STRIDE_7 "b\n" + // %bb.8: // in Loop: Header=BB112_4 Depth=1 + "mov v6.16b, v31.16b\n" + "mov v15.16b, v8.16b\n" + "mov v14.16b, v30.16b\n" + "mov v13.16b, v29.16b\n" + "mov v11.16b, v28.16b\n" + "mov w15, w6\n" + "mov %[filter_workspace], x8\n" + "mov v21.16b, v4.16b\n" + "cmp w15, w13\n" + "ldr x15, [sp, #128]\n" // 8-byte Folded Reload + "b.ge " DC_KERNEL_NO_MULT_STRIDE_10 "f\n" + DC_KERNEL_NO_MULT_STRIDE_9 ":\n" // Parent Loop BB112_4 Depth=1 + // => This Inner Loop Header: Depth=2 + "mov v9.16b, v25.16b\n" + "mov v10.16b, v25.16b\n" + ".word 0x4e9f96c9 // sdot v9.4s, v22.16b, v31.16b\n" + ".word 0x4e8896e9 // sdot v9.4s, v23.16b, v8.16b\n" + ".word 0x4e9e96ca // sdot v10.4s, v22.16b, v30.16b\n" + ".word 0x4e9e9709 // sdot v9.4s, v24.16b, v30.16b\n" + ".word 0x4e9d96ea // sdot v10.4s, v23.16b, v29.16b\n" + ".word 0x4e9c970a // sdot v10.4s, v24.16b, v28.16b\n" + "sqrdmulh v9.4s, v9.4s, v27.4s\n" + "sqrdmulh v10.4s, v10.4s, v27.4s\n" + "sqrshl v9.4s, v9.4s, v26.4s\n" + "sqrshl v10.4s, v10.4s, v26.4s\n" + "sqxtn v9.4h, v9.4s\n" + "sqxtn2 v9.8h, v10.4s\n" + "sqadd v9.8h, v9.8h, v0.8h\n" + "sqxtn v9.8b, v9.8h\n" + "smax v9.8b, v9.8b, v1.8b\n" + "rev32 v31.8h, v31.8h\n" + "rev32 v8.8h, v8.8h\n" + "rev32 v30.8h, v30.8h\n" + "rev32 v29.8h, v29.8h\n" + "rev32 v28.8h, v28.8h\n" + "smin v9.8b, v9.8b, v2.8b\n" + "add x16, x25, x11\n" + "subs x15, x15, #1\n" // =1 + "trn1 v31.8h, v31.8h, v6.8h\n" + "trn1 v8.8h, v8.8h, v15.8h\n" + "trn1 v29.8h, v29.8h, v13.8h\n" + "trn1 v30.8h, v30.8h, v14.8h\n" + "trn1 v28.8h, v28.8h, v11.8h\n" + "str s9, [x25]\n" + "add x25, x25, x22\n" + "st1 { v9.s }[1], [x16]\n" + "b.ne " DC_KERNEL_NO_MULT_STRIDE_9 "b\n" + DC_KERNEL_NO_MULT_STRIDE_10 ":\n" // in Loop: Header=BB112_4 Depth=1 + "ldr q22, [x26, #16]\n" + "ldr q23, [x26, #48]\n" + "ldr q24, [x26, #80]\n" + "ldr q29, [x28, #16]!\n" + "ldr q25, [%[bias_data], #16]\n" + "ldr q26, [x24, #16]\n" + "ldr q27, [x14, #16]\n" + "ldr q8, [x28, x12]\n" + "ldr q31, [x28, x7]\n" + "ldr q30, [x28, x21]\n" + "ldr q28, [x28, x20]\n" + "ldr x23, [sp, #56]\n" // 8-byte Folded Reload + "cmp w6, #0\n" // =0 + "mov v10.16b, v5.16b\n" + "b.le " DC_KERNEL_NO_MULT_STRIDE_24 "f\n" + // %bb.11: // in Loop: Header=BB112_4 Depth=1 + "mov v6.16b, v21.16b\n" + "mov v9.16b, v3.16b\n" + "mov w14, wzr\n" + "mov x15, xzr\n" + "add x16, x28, #32\n" // =32 + "add x17, %[function_params], #4\n" // =4 + "mov %[function_params], x6\n" + "mov v17.16b, v31.16b\n" + DC_KERNEL_NO_MULT_STRIDE_12 ":\n" // Parent Loop BB112_4 Depth=1 + // => This Inner Loop Header: Depth=2 + "mov v3.16b, v25.16b\n" + "mov v4.16b, v25.16b\n" + ".word 0x4e9d96c3 // sdot v3.4s, v22.16b, v29.16b\n" + ".word 0x4e9196c4 // sdot v4.4s, v22.16b, v17.16b\n" + ".word 0x4e8896e3 // sdot v3.4s, v23.16b, v8.16b\n" + ".word 0x4e9e96e4 // sdot v4.4s, v23.16b, v30.16b\n" + ".word 0x4e919703 // sdot v3.4s, v24.16b, v17.16b\n" + ".word 0x4e9c9704 // sdot v4.4s, v24.16b, v28.16b\n" + "sqrdmulh v3.4s, v3.4s, v27.4s\n" + "and %[scratch_block_data], x15, #0xffffffe0\n" + "sqrdmulh v4.4s, v4.4s, v27.4s\n" + "sqrshl v3.4s, v3.4s, v26.4s\n" + "add %[scratch_block_data], x16, x0\n" + "sqrshl v4.4s, v4.4s, v26.4s\n" + "sqxtn v3.4h, v3.4s\n" + "rev32 v5.8h, v29.8h\n" + "rev32 v18.8h, v8.8h\n" + "rev32 v19.8h, v31.8h\n" + "rev32 v20.8h, v30.8h\n" + "ldr q29, [%[scratch_block_data]]\n" + "ldr q8, [%[scratch_block_data], x12]\n" + "ldr q31, [%[scratch_block_data], x7]\n" + "ldr q30, [%[scratch_block_data], x21]\n" + "rev32 v17.8h, v28.8h\n" + "ldr q28, [%[scratch_block_data], x20]\n" + 
"sqxtn2 v3.8h, v4.4s\n" + "sqadd v3.8h, v3.8h, v0.8h\n" + "sqxtn v3.8b, v3.8h\n" + "add x8, x17, w14, sxtw\n" + "smax v3.8b, v3.8b, v1.8b\n" + "add %[scratch_block_data], x8, x11\n" + "smin v3.8b, v3.8b, v2.8b\n" + "mov v21.16b, v25.16b\n" + "str s3, [x8]\n" + "st1 { v3.s }[1], [%[scratch_block_data]]\n" + "trn1 v3.8h, v5.8h, v29.8h\n" + "mov v4.16b, v25.16b\n" + "trn1 v5.8h, v18.8h, v8.8h\n" + "trn1 v18.8h, v19.8h, v31.8h\n" + ".word 0x4e8396d5 // sdot v21.4s, v22.16b, v3.16b\n" + "trn1 v19.8h, v20.8h, v30.8h\n" + ".word 0x4e9296c4 // sdot v4.4s, v22.16b, v18.16b\n" + ".word 0x4e8596f5 // sdot v21.4s, v23.16b, v5.16b\n" + "trn1 v17.8h, v17.8h, v28.8h\n" + ".word 0x4e9396e4 // sdot v4.4s, v23.16b, v19.16b\n" + ".word 0x4e929715 // sdot v21.4s, v24.16b, v18.16b\n" + ".word 0x4e919704 // sdot v4.4s, v24.16b, v17.16b\n" + "sqrdmulh v3.4s, v21.4s, v27.4s\n" + "sqrdmulh v4.4s, v4.4s, v27.4s\n" + "sqrshl v3.4s, v3.4s, v26.4s\n" + "sqrshl v4.4s, v4.4s, v26.4s\n" + "sqxtn v3.4h, v3.4s\n" + "sqxtn2 v3.8h, v4.4s\n" + "sqadd v3.8h, v3.8h, v0.8h\n" + "sqxtn v3.8b, v3.8h\n" + "add x8, x8, x19\n" + "smax v3.8b, v3.8b, v1.8b\n" + "add x15, x15, #32\n" // =32 + "subs %[function_params], %[function_params], #1\n" // =1 + "add %[scratch_block_data], x8, x11\n" + "smin v3.8b, v3.8b, v2.8b\n" + "add w14, w14, w22\n" + "mov v17.16b, v31.16b\n" + "str s3, [x8]\n" + "st1 { v3.s }[1], [%[scratch_block_data]]\n" + "b.ne " DC_KERNEL_NO_MULT_STRIDE_12 "b\n" + // %bb.13: // in Loop: Header=BB112_4 Depth=1 + "mov v15.16b, v8.16b\n" + "mov v14.16b, v31.16b\n" + "mov v13.16b, v30.16b\n" + "mov v11.16b, v28.16b\n" + "mov w14, w6\n" + "mov v21.16b, v6.16b\n" + "mov v6.16b, v29.16b\n" + "mov v3.16b, v29.16b\n" + "cmp w14, w13\n" + "b.ge " DC_KERNEL_NO_MULT_STRIDE_2 "b\n" + "b " DC_KERNEL_NO_MULT_STRIDE_25 "f\n" + DC_KERNEL_NO_MULT_STRIDE_14 ":\n" // in Loop: Header=BB112_4 Depth=1 + "cmp w13, #1\n" // =1 + "add x27, %[bias_data], #32\n" // =32 + "b.lt " DC_KERNEL_NO_MULT_STRIDE_3 "b\n" + // %bb.15: // in Loop: Header=BB112_4 Depth=1 + "ldr x8, [sp, #136]\n" // 8-byte Folded Reload + "lsl w14, w10, #3\n" + "stp q15, q14, [sp, #64]\n" // 32-byte Folded Spill + "stp q13, q11, [sp, #96]\n" // 32-byte Folded Spill + "add x15, x28, x12\n" + "lsl x16, x14, #2\n" + "ldp q10, q11, [x15]\n" + "add x15, x8, x16\n" + "ldr x8, [sp, #144]\n" // 8-byte Folded Reload + "ldp q30, q31, [x15]\n" + "add x15, x28, x7\n" + "ldp q22, q23, [x26]\n" + "add x16, x8, x16\n" + "ldr w8, [sp, #44]\n" // 4-byte Folded Reload + "ldp q24, q25, [x26, #32]\n" + "ldp q26, q27, [x26, #64]\n" + "ldp q17, q18, [%[bias_data]]\n" + "ldp q14, q13, [x28], #32\n" + "ldp q8, q9, [x16]\n" + "ldp q12, q15, [x15]\n" + "add %[bias_data], %[output_block_data], x14\n" + "cmp w13, w8\n" + "b.ne " DC_KERNEL_NO_MULT_STRIDE_27 "f\n" + // %bb.16: // in Loop: Header=BB112_4 Depth=1 + "ldr x25, [sp, #32]\n" // 8-byte Folded Reload + "mov x14, xzr\n" + "mov w4, wzr\n" + "mov x24, x13\n" + "cbnz x25, " DC_KERNEL_NO_MULT_STRIDE_20 "f\n" + "b " DC_KERNEL_NO_MULT_STRIDE_21 "f\n" + DC_KERNEL_NO_MULT_STRIDE_17 ":\n" // in Loop: Header=BB112_21 Depth=2 + "mov v28.16b, v17.16b\n" + ".word 0x4e8e96dc // sdot v28.4s, v22.16b, v14.16b\n" + "mov v29.16b, v18.16b\n" + ".word 0x4e8d96fd // sdot v29.4s, v23.16b, v13.16b\n" + ".word 0x4e8a971c // sdot v28.4s, v24.16b, v10.16b\n" + ".word 0x4e8b973d // sdot v29.4s, v25.16b, v11.16b\n" + ".word 0x4e8c975c // sdot v28.4s, v26.16b, v12.16b\n" + ".word 0x4e8f977d // sdot v29.4s, v27.16b, v15.16b\n" + "sqrdmulh v28.4s, v28.4s, v8.4s\n" + "sqrdmulh 
v29.4s, v29.4s, v9.4s\n" + "sqrshl v28.4s, v28.4s, v30.4s\n" + "sqrshl v29.4s, v29.4s, v31.4s\n" + "sqxtn v28.4h, v28.4s\n" + "sqxtn2 v28.8h, v29.4s\n" + "sqadd v28.8h, v28.8h, v0.8h\n" + "sqxtn v28.8b, v28.8h\n" + "smax v28.8b, v28.8b, v1.8b\n" + "smin v28.8b, v28.8b, v2.8b\n" + "mov v14.16b, v3.16b\n" + "mov v10.16b, v20.16b\n" + "mov v12.16b, v16.16b\n" + "mov v13.16b, v19.16b\n" + "mov v11.16b, v21.16b\n" + "mov v15.16b, v5.16b\n" + "str d28, [x15, x19]\n" + DC_KERNEL_NO_MULT_STRIDE_18 ":\n" // in Loop: Header=BB112_21 Depth=2 + "add w4, w4, w22\n" + "add x14, x14, #32\n" // =32 + "subs x24, x24, #1\n" // =1 + "sub x25, x25, #1\n" // =1 + "b.eq " DC_KERNEL_NO_MULT_STRIDE_33 "f\n" + // %bb.19: // in Loop: Header=BB112_21 Depth=2 + "cbz x25, " DC_KERNEL_NO_MULT_STRIDE_21 "f\n" + DC_KERNEL_NO_MULT_STRIDE_20 ":\n" // in Loop: Header=BB112_4 Depth=1 + "and x15, x14, #0xffffffe0\n" + "add x15, x28, x15\n" + "add x16, x15, x12\n" + "add x17, x15, x7\n" + "ldp q3, q19, [x15]\n" + "ldp q20, q21, [x16]\n" + "ldp q16, q5, [x17]\n" + DC_KERNEL_NO_MULT_STRIDE_21 ":\n" // Parent Loop BB112_4 Depth=1 + // => This Inner Loop Header: Depth=2 + "mov v28.16b, v17.16b\n" + "mov v29.16b, v18.16b\n" + ".word 0x4e8e96dc // sdot v28.4s, v22.16b, v14.16b\n" + ".word 0x4e8a971c // sdot v28.4s, v24.16b, v10.16b\n" + ".word 0x4e8d96fd // sdot v29.4s, v23.16b, v13.16b\n" + ".word 0x4e8c975c // sdot v28.4s, v26.16b, v12.16b\n" + ".word 0x4e8b973d // sdot v29.4s, v25.16b, v11.16b\n" + ".word 0x4e8f977d // sdot v29.4s, v27.16b, v15.16b\n" + "sqrdmulh v28.4s, v28.4s, v8.4s\n" + "sqrdmulh v29.4s, v29.4s, v9.4s\n" + "sqrshl v28.4s, v28.4s, v30.4s\n" + "sqrshl v29.4s, v29.4s, v31.4s\n" + "sqxtn v28.4h, v28.4s\n" + "sqxtn2 v28.8h, v29.4s\n" + "sqadd v28.8h, v28.8h, v0.8h\n" + "sqxtn v28.8b, v28.8h\n" + "rev32 v14.8h, v14.8h\n" + "rev32 v10.8h, v10.8h\n" + "rev32 v12.8h, v12.8h\n" + "rev32 v13.8h, v13.8h\n" + "rev32 v11.8h, v11.8h\n" + "rev32 v15.8h, v15.8h\n" + "smax v28.8b, v28.8b, v1.8b\n" + "add x15, %[bias_data], w4, sxtw\n" + "cmp w5, #1\n" // =1 + "trn1 v14.8h, v14.8h, v3.8h\n" + "trn1 v13.8h, v13.8h, v19.8h\n" + "trn1 v10.8h, v10.8h, v20.8h\n" + "trn1 v11.8h, v11.8h, v21.8h\n" + "trn1 v12.8h, v12.8h, v16.8h\n" + "smin v28.8b, v28.8b, v2.8b\n" + "trn1 v15.8h, v15.8h, v5.8h\n" + "str d28, [x15]\n" + "b.gt " DC_KERNEL_NO_MULT_STRIDE_17 "b\n" + // %bb.22: // in Loop: Header=BB112_21 Depth=2 + "cbz x25, " DC_KERNEL_NO_MULT_STRIDE_18 "b\n" + "b " DC_KERNEL_NO_MULT_STRIDE_17 "b\n" + DC_KERNEL_NO_MULT_STRIDE_23 ":\n" // in Loop: Header=BB112_4 Depth=1 + "mov w15, wzr\n" + "cmp w15, w13\n" + "ldr x15, [sp, #128]\n" // 8-byte Folded Reload + "b.lt " DC_KERNEL_NO_MULT_STRIDE_9 "b\n" + "b " DC_KERNEL_NO_MULT_STRIDE_10 "b\n" + DC_KERNEL_NO_MULT_STRIDE_24 ":\n" // in Loop: Header=BB112_4 Depth=1 + "mov v9.16b, v3.16b\n" + "mov w14, wzr\n" + "cmp w14, w13\n" + "b.ge " DC_KERNEL_NO_MULT_STRIDE_2 "b\n" + DC_KERNEL_NO_MULT_STRIDE_25 ":\n" // in Loop: Header=BB112_4 Depth=1 + "ldr x8, [sp, #8]\n" // 8-byte Folded Reload + "ldr x15, [sp, #128]\n" // 8-byte Folded Reload + "add x14, x8, x27\n" + DC_KERNEL_NO_MULT_STRIDE_26 ":\n" // Parent Loop BB112_4 Depth=1 + // => This Inner Loop Header: Depth=2 + "mov v3.16b, v25.16b\n" + "mov v4.16b, v25.16b\n" + ".word 0x4e9d96c3 // sdot v3.4s, v22.16b, v29.16b\n" + ".word 0x4e8896e3 // sdot v3.4s, v23.16b, v8.16b\n" + ".word 0x4e9f96c4 // sdot v4.4s, v22.16b, v31.16b\n" + ".word 0x4e9f9703 // sdot v3.4s, v24.16b, v31.16b\n" + ".word 0x4e9e96e4 // sdot v4.4s, v23.16b, v30.16b\n" + ".word 
0x4e9c9704 // sdot v4.4s, v24.16b, v28.16b\n" + "sqrdmulh v3.4s, v3.4s, v27.4s\n" + "sqrdmulh v4.4s, v4.4s, v27.4s\n" + "sqrshl v3.4s, v3.4s, v26.4s\n" + "sqrshl v4.4s, v4.4s, v26.4s\n" + "sqxtn v3.4h, v3.4s\n" + "sqxtn2 v3.8h, v4.4s\n" + "sqadd v3.8h, v3.8h, v0.8h\n" + "sqxtn v3.8b, v3.8h\n" + "smax v3.8b, v3.8b, v1.8b\n" + "rev32 v5.8h, v29.8h\n" + "rev32 v17.8h, v8.8h\n" + "rev32 v18.8h, v31.8h\n" + "rev32 v19.8h, v30.8h\n" + "rev32 v20.8h, v28.8h\n" + "smin v3.8b, v3.8b, v2.8b\n" + "add x16, x14, x11\n" + "subs x15, x15, #1\n" // =1 + "trn1 v29.8h, v5.8h, v6.8h\n" + "trn1 v8.8h, v17.8h, v15.8h\n" + "trn1 v30.8h, v19.8h, v13.8h\n" + "trn1 v31.8h, v18.8h, v14.8h\n" + "trn1 v28.8h, v20.8h, v11.8h\n" + "str s3, [x14]\n" + "add x14, x14, x22\n" + "st1 { v3.s }[1], [x16]\n" + "b.ne " DC_KERNEL_NO_MULT_STRIDE_26 "b\n" + "b " DC_KERNEL_NO_MULT_STRIDE_2 "b\n" + DC_KERNEL_NO_MULT_STRIDE_27 ":\n" // in Loop: Header=BB112_4 Depth=1 + "ldr x25, [sp, #32]\n" // 8-byte Folded Reload + "mov w14, wzr\n" + "mov %[function_params], xzr\n" + "mov x24, x13\n" + "str q6, [sp, #16]\n" // 16-byte Folded Spill + "b " DC_KERNEL_NO_MULT_STRIDE_30 "f\n" + DC_KERNEL_NO_MULT_STRIDE_28 ":\n" // in Loop: Header=BB112_30 Depth=2 + "mov v3.16b, v17.16b\n" + ".word 0x4e8e96c3 // sdot v3.4s, v22.16b, v14.16b\n" + "mov v4.16b, v18.16b\n" + ".word 0x4e8d96e4 // sdot v4.4s, v23.16b, v13.16b\n" + ".word 0x4e8a9703 // sdot v3.4s, v24.16b, v10.16b\n" + ".word 0x4e8b9724 // sdot v4.4s, v25.16b, v11.16b\n" + ".word 0x4e8c9743 // sdot v3.4s, v26.16b, v12.16b\n" + ".word 0x4e8f9764 // sdot v4.4s, v27.16b, v15.16b\n" + "sqrdmulh v3.4s, v3.4s, v8.4s\n" + "sqrdmulh v4.4s, v4.4s, v9.4s\n" + "sqrshl v3.4s, v3.4s, v30.4s\n" + "sqrshl v4.4s, v4.4s, v31.4s\n" + "sqxtn v3.4h, v3.4s\n" + "sqxtn2 v3.8h, v4.4s\n" + "sqadd v3.8h, v3.8h, v0.8h\n" + "sqxtn v3.8b, v3.8h\n" + "smax v3.8b, v3.8b, v1.8b\n" + "smin v3.8b, v3.8b, v2.8b\n" + "str d3, [x15, x19]\n" + "mov v3.16b, v6.16b\n" + "mov v14.16b, v6.16b\n" + "mov v10.16b, v20.16b\n" + "mov v12.16b, v16.16b\n" + "mov v13.16b, v19.16b\n" + "mov v11.16b, v21.16b\n" + "mov v15.16b, v5.16b\n" + DC_KERNEL_NO_MULT_STRIDE_29 ":\n" // in Loop: Header=BB112_30 Depth=2 + "add %[function_params], %[function_params], #" STR(DP_OFFSET_OUTPUT_MULTIPLIER) "\n" // =32 + "sub x25, x25, #1\n" // =1 + "subs x24, x24, #1\n" // =1 + "add w14, w14, w22\n" + "b.eq " DC_KERNEL_NO_MULT_STRIDE_34 "f\n" + DC_KERNEL_NO_MULT_STRIDE_30 ":\n" // Parent Loop BB112_4 Depth=1 + // => This Inner Loop Header: Depth=2 + "mov v28.16b, v17.16b\n" + "mov v29.16b, v18.16b\n" + ".word 0x4e8e96dc // sdot v28.4s, v22.16b, v14.16b\n" + "and x16, %[function_params], #0xffffffe0\n" + ".word 0x4e8d96fd // sdot v29.4s, v23.16b, v13.16b\n" + ".word 0x4e8a971c // sdot v28.4s, v24.16b, v10.16b\n" + "add x16, x28, x16\n" + ".word 0x4e8b973d // sdot v29.4s, v25.16b, v11.16b\n" + ".word 0x4e8c975c // sdot v28.4s, v26.16b, v12.16b\n" + "rev32 v19.8h, v14.8h\n" + "rev32 v3.8h, v13.8h\n" + "ldp q14, q13, [x16]\n" + ".word 0x4e8f977d // sdot v29.4s, v27.16b, v15.16b\n" + "sqrdmulh v28.4s, v28.4s, v8.4s\n" + "sqrdmulh v29.4s, v29.4s, v9.4s\n" + "sqrshl v28.4s, v28.4s, v30.4s\n" + "add x17, x16, x12\n" + "add x16, x16, x7\n" + "sqrshl v29.4s, v29.4s, v31.4s\n" + "sqxtn v28.4h, v28.4s\n" + "rev32 v21.8h, v12.8h\n" + "rev32 v4.8h, v11.8h\n" + "ldp q20, q11, [x17]\n" + "ldp q12, q5, [x16]\n" + "sqxtn2 v28.8h, v29.4s\n" + "mov v6.16b, v14.16b\n" + "trn1 v14.8h, v19.8h, v14.8h\n" + "mov v19.16b, v13.16b\n" + "trn1 v13.8h, v3.8h, v13.8h\n" + "sqadd v3.8h, 
v28.8h, v0.8h\n" + "sqxtn v3.8b, v3.8h\n" + "rev32 v16.8h, v10.8h\n" + "rev32 v7.8h, v15.8h\n" + "smax v3.8b, v3.8b, v1.8b\n" + "add x15, %[bias_data], w14, sxtw\n" + "cmp w5, #1\n" // =1 + "trn1 v10.8h, v16.8h, v20.8h\n" + "mov v16.16b, v12.16b\n" + "trn1 v12.8h, v21.8h, v12.8h\n" + "mov v21.16b, v11.16b\n" + "trn1 v11.8h, v4.8h, v11.8h\n" + "smin v3.8b, v3.8b, v2.8b\n" + "trn1 v15.8h, v7.8h, v5.8h\n" + "str d3, [x15]\n" + "b.gt " DC_KERNEL_NO_MULT_STRIDE_28 "b\n" + // %bb.31: // in Loop: Header=BB112_30 Depth=2 + "cbnz x25, " DC_KERNEL_NO_MULT_STRIDE_28 "b\n" + // %bb.32: // in Loop: Header=BB112_30 Depth=2 + "mov v3.16b, v6.16b\n" + "b " DC_KERNEL_NO_MULT_STRIDE_29 "b\n" + DC_KERNEL_NO_MULT_STRIDE_33 ":\n" // in Loop: Header=BB112_4 Depth=1 + "ldp q13, q11, [sp, #96]\n" // 32-byte Folded Reload + "ldp q15, q14, [sp, #64]\n" // 32-byte Folded Reload + "b " DC_KERNEL_NO_MULT_STRIDE_3 "b\n" + DC_KERNEL_NO_MULT_STRIDE_34 ":\n" // in Loop: Header=BB112_4 Depth=1 + "ldp q13, q11, [sp, #96]\n" // 32-byte Folded Reload + "ldp q15, q14, [sp, #64]\n" // 32-byte Folded Reload + "ldr q6, [sp, #16]\n" // 16-byte Folded Reload + "b " DC_KERNEL_NO_MULT_STRIDE_3 "b\n" + DC_KERNEL_NO_MULT_STRIDE_35 ":\n" + + // Compiled intrinsics total stack 320, now 176 for spillage only. + "add sp, sp, #176\n" // =320 + : + // Outputs. + [ scratch_block_data ] "+r"(scratch_block_data), + [ filter_workspace ] "+r"(filter_workspace), + [ bias_data ] "+r"(bias_data), + [ output_block_data ] "+r"(output_block_data) + : + // Inputs. + [ function_params ] "r"(function_params) + : + // Clobbers. + "cc", "memory", + // We use these NEON registers. + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31", + // We use these general-purpose registers. + "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", + "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", + "x27", "x28"); + +#undef DC_KERNEL_NO_MULT_STRIDE_1 +#undef DC_KERNEL_NO_MULT_STRIDE_2 +#undef DC_KERNEL_NO_MULT_STRIDE_3 +#undef DC_KERNEL_NO_MULT_STRIDE_4 +#undef DC_KERNEL_NO_MULT_STRIDE_5 +#undef DC_KERNEL_NO_MULT_STRIDE_6 +#undef DC_KERNEL_NO_MULT_STRIDE_7 +#undef DC_KERNEL_NO_MULT_STRIDE_8 +#undef DC_KERNEL_NO_MULT_STRIDE_9 +#undef DC_KERNEL_NO_MULT_STRIDE_10 +#undef DC_KERNEL_NO_MULT_STRIDE_11 +#undef DC_KERNEL_NO_MULT_STRIDE_12 +#undef DC_KERNEL_NO_MULT_STRIDE_13 +#undef DC_KERNEL_NO_MULT_STRIDE_14 +#undef DC_KERNEL_NO_MULT_STRIDE_15 +#undef DC_KERNEL_NO_MULT_STRIDE_16 +#undef DC_KERNEL_NO_MULT_STRIDE_17 +#undef DC_KERNEL_NO_MULT_STRIDE_18 +#undef DC_KERNEL_NO_MULT_STRIDE_19 +#undef DC_KERNEL_NO_MULT_STRIDE_20 +#undef DC_KERNEL_NO_MULT_STRIDE_21 +#undef DC_KERNEL_NO_MULT_STRIDE_22 +#undef DC_KERNEL_NO_MULT_STRIDE_23 +#undef DC_KERNEL_NO_MULT_STRIDE_24 +#undef DC_KERNEL_NO_MULT_STRIDE_25 +#undef DC_KERNEL_NO_MULT_STRIDE_26 +#undef DC_KERNEL_NO_MULT_STRIDE_27 +#undef DC_KERNEL_NO_MULT_STRIDE_28 +#undef DC_KERNEL_NO_MULT_STRIDE_29 +#undef DC_KERNEL_NO_MULT_STRIDE_30 +#undef DC_KERNEL_NO_MULT_STRIDE_31 +#undef DC_KERNEL_NO_MULT_STRIDE_32 +#undef DC_KERNEL_NO_MULT_STRIDE_33 +#undef DC_KERNEL_NO_MULT_STRIDE_34 +#undef DC_KERNEL_NO_MULT_STRIDE_35 + } // NOLINT(readability/fn_size) Manually unrolled. 
+ + static inline void Run(const int8* scratch_block_data, + const int8* filter_workspace, const int32* bias_data, + int8* output_block_data, + const DepthwiseConvDotProdParams* function_params) { + KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data, + output_block_data, function_params); + } +}; + +template <> +struct KernelMacroBlock { + static inline void KernelMacroBlockNeon( + const int8* scratch_block_data, const int8* filter_workspace, + const int32* bias_data, int8* output_block_data, + const DepthwiseConvDotProdParams* function_params) { + // Note that argument registers may be reused after parameter loading. + // x0 %[scratch_block_data] + // x1 %[filter_workspace] + // x2 %[bias_data] + // x3 %[output_block_data] + // x4 %[function_params] +#define DC_KERNEL_MULT_1 "1" +#define DC_KERNEL_MULT_2 "2" +#define DC_KERNEL_MULT_3 "3" +#define DC_KERNEL_MULT_4 "4" +#define DC_KERNEL_MULT_5 "5" +#define DC_KERNEL_MULT_6 "6" +#define DC_KERNEL_MULT_7 "7" +#define DC_KERNEL_MULT_8 "8" +#define DC_KERNEL_MULT_9 "9" +#define DC_KERNEL_MULT_10 "10" +#define DC_KERNEL_MULT_11 "11" +#define DC_KERNEL_MULT_12 "12" +#define DC_KERNEL_MULT_13 "13" +#define DC_KERNEL_MULT_14 "14" +#define DC_KERNEL_MULT_15 "15" +#define DC_KERNEL_MULT_16 "16" +#define DC_KERNEL_MULT_17 "17" +#define DC_KERNEL_MULT_18 "18" +#define DC_KERNEL_MULT_19 "19" +#define DC_KERNEL_MULT_20 "20" +#define DC_KERNEL_MULT_21 "21" +#define DC_KERNEL_MULT_22 "22" +#define DC_KERNEL_MULT_23 "23" + + asm volatile( + // Compiled code used block of 336 for spill out of total stack of 448. + // However, an 8-byte spill was sneaked in to #344. + // Spillage increased to 352 and these are mapped to #336. + "sub sp, sp, #352\n" // =448 + + + "ldr w8, [%[function_params], #" STR(DP_OFFSET_DEPTH_MICRO_REPEATS) "]\n" + "str %[filter_workspace], [sp, #56]\n" // 8-byte Folded Spill + "cmp w8, #1\n" // =1 + "str x8, [sp, #32]\n" // 8-byte Folded Spill + "b.lt " DC_KERNEL_MULT_23 "f\n" + // %bb.1: + "ldr w11, [%[function_params], #" STR(DP_OFFSET_OUTPUT_RESIDUAL_WIDTH) "]\n" + "ldr x12, [%[function_params], #" STR(DP_OFFSET_OUTPUT_MULTPLIPLIER_PER_CHANNEL) "]\n" + "ldp w17, w15, [%[function_params], #" STR(DP_OFFSET_OUTPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n" + "ldr w16, [%[function_params], #" STR(DP_OFFSET_OUTBOUND_BLOCK_HEIGHT) "]\n" + "ldpsw x21, x6, [%[function_params], #" STR(DP_OFFSET_OUTPUT_HEIGHT_STRIDE) "]\n" + "ldrb w8, [%[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MIN) "]\n" + "ldrb w9, [%[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MAX) "]\n" + "add x10, %[function_params], #" STR(DP_OFFSET_OUTPUT_OFFSET) "\n" // =28 + "str x12, [sp, #24]\n" // 8-byte Folded Spill + "ldr x12, [%[function_params], #" STR(DP_OFFSET_OUTPUT_SHIFT_PER_CHANNEL) "]\n" + "ldrsw %[function_params], [%[function_params], #" STR(DP_OFFSET_OUTPUT_DEPTH) "]\n" + "cmp w11, #4\n" // =4 + "ccmp w15, w17, #0, lt\n" + "csel w25, w15, w17, lt\n" + "cmp w16, #1\n" // =1 + "str x16, [sp, #80]\n" // 8-byte Folded Spill + "cset w16, lt\n" + "cmp w17, #1\n" // =1 + "dup v1.16b, w8\n" + "fmov s3, w8\n" + "dup v2.16b, w9\n" + "fmov s4, w9\n" + "lsl x8, %[function_params], #1\n" + "add x9, x21, %[function_params]\n" + "str w17, [sp, #324]\n" // 4-byte Folded Spill + "cset w17, lt\n" + "ld1r { v0.8h }, [x10]\n" + "lsl x7, x21, #1\n" + "add x22, x21, x21, lsl #1\n" + "add x10, x8, %[function_params]\n" + "add x9, %[output_block_data], x9\n" + "orr w16, w16, w17\n" + "str x9, [sp, #216]\n" // 8-byte Folded Spill + "str w15, [sp, #316]\n" 
// 4-byte Folded Spill + "add x9, x10, x22\n" + "add x15, x10, x7\n" + "str w16, [sp, #12]\n" // 4-byte Folded Spill + "add x16, x10, x21\n" + "add x10, %[output_block_data], x10\n" + "str x10, [sp, #200]\n" // 8-byte Folded Spill + "add x10, x6, #4\n" // =4 + "str x10, [sp, #160]\n" // 8-byte Folded Spill + "lsl x10, %[function_params], #2\n" + "str x10, [sp, #152]\n" // 8-byte Folded Spill + "add x10, %[output_block_data], x21\n" + "add x17, x6, x6, lsl #2\n" + "str x10, [sp, #144]\n" // 8-byte Folded Spill + "add x10, %[output_block_data], %[function_params]\n" + "lsl x24, x6, #2\n" + "str x10, [sp, #136]\n" // 8-byte Folded Spill + "add x10, x17, #4\n" // =4 + "add x19, x6, x6, lsl #1\n" + "str x10, [sp, #128]\n" // 8-byte Folded Spill + "add x10, x24, #4\n" // =4 + "str x12, [sp, #16]\n" // 8-byte Folded Spill + "str w11, [sp, #320]\n" // 4-byte Folded Spill + "lsl x20, x6, #1\n" + "add x11, x8, x22\n" + "add x12, x8, x7\n" + "add x13, x8, x21\n" + "add x8, %[output_block_data], x8\n" + "str x10, [sp, #120]\n" // 8-byte Folded Spill + "add x10, x19, #4\n" // =4 + "stp x8, x7, [sp, #224]\n" // 16-byte Folded Spill + "add x8, x22, %[function_params]\n" + "str x10, [sp, #112]\n" // 8-byte Folded Spill + "add x10, x20, #4\n" // =4 + "mov x5, xzr\n" + "add x14, x7, %[function_params]\n" + "add x8, %[output_block_data], x8\n" + "str x10, [sp, #104]\n" // 8-byte Folded Spill + "add x10, %[output_block_data], x7\n" + "add x26, %[output_block_data], x11\n" + "str x8, [sp, #184]\n" // 8-byte Folded Spill + "add x8, %[output_block_data], x14\n" + "mov x14, x5\n" + "add x5, %[output_block_data], x9\n" + "add x9, %[output_block_data], x16\n" + "mov x16, x22\n" + "stp x19, x6, [sp, #296]\n" // 16-byte Folded Spill + "mov x11, x7\n" + "str x20, [sp, #328]\n" // 8-byte Folded Spill + "str x10, [sp, #96]\n" // 8-byte Folded Spill + "add x10, %[output_block_data], x22\n" + "stp x22, %[output_block_data], [sp, #64]\n" // 16-byte Folded Spill + "ldr x7, [sp, #160]\n" // 8-byte Folded Reload + "ldr x23, [sp, #136]\n" // 8-byte Folded Reload + "ldp x22, x19, [sp, #112]\n" // 16-byte Folded Reload + "ldr x20, [sp, #104]\n" // 8-byte Folded Reload + "mov %[filter_workspace], xzr\n" + "dup v3.8b, v3.b[0]\n" + "dup v4.8b, v4.b[0]\n" + "add x27, %[output_block_data], x12\n" + "add x28, %[output_block_data], x13\n" + "mov x13, %[filter_workspace]\n" + "stp x8, x17, [sp, #168]\n" // 16-byte Folded Spill + "add x8, %[output_block_data], x15\n" + "str x10, [sp, #88]\n" // 8-byte Folded Spill + "mov w10, #4\n" + "stp x21, %[scratch_block_data], [sp, #256]\n" // 16-byte Folded Spill + "str w25, [sp, #212]\n" // 4-byte Folded Spill + "str x24, [sp, #192]\n" // 8-byte Folded Spill + "str x9, [sp, #336]\n" // 8-byte Folded Spill + "b " DC_KERNEL_MULT_5 "f\n" + DC_KERNEL_MULT_2 ":\n" // in Loop: Header=BB107_5 Depth=1 + "mov %[output_block_data], x21\n" + "ldp x21, %[scratch_block_data], [sp, #256]\n" // 16-byte Folded Reload + DC_KERNEL_MULT_3 ":\n" // in Loop: Header=BB107_5 Depth=1 + "mov %[bias_data], x11\n" + DC_KERNEL_MULT_4 ":\n" // in Loop: Header=BB107_5 Depth=1 + "ldp x12, x14, [sp, #32]\n" // 16-byte Folded Reload + "ldr x11, [sp, #72]\n" // 8-byte Folded Reload + "ldr x13, [sp, #48]\n" // 8-byte Folded Reload + "add x14, x14, #1\n" // =1 + "add x11, x11, #8\n" // =8 + "cmp x14, x12\n" + "add x13, x13, #8\n" // =8 + "str x11, [sp, #72]\n" // 8-byte Folded Spill + "b.eq " DC_KERNEL_MULT_23 "f\n" + DC_KERNEL_MULT_5 ":\n" // =>This Loop Header: Depth=1 + // Child Loop BB107_19 Depth 2 + // Child Loop BB107_21 
Depth 3 + // Child Loop BB107_22 Depth 4 + // Child Loop BB107_8 Depth 2 + // Child Loop BB107_10 Depth 3 + // Child Loop BB107_14 Depth 3 + "ldr x12, [sp, #56]\n" // 8-byte Folded Reload + "ldr x16, [sp, #80]\n" // 8-byte Folded Reload + "ldp q18, q5, [x12]\n" + "ldp q17, q6, [x12, #32]\n" + "ldp q16, q7, [x12, #64]\n" + "cmp w16, #4\n" // =4 + "add x12, x12, #96\n" // =96 + "stp x13, x12, [sp, #48]\n" // 16-byte Folded Spill + "str x14, [sp, #40]\n" // 8-byte Folded Spill + "b.ne " DC_KERNEL_MULT_16 "f\n" + // %bb.6: // in Loop: Header=BB107_5 Depth=1 + "lsl w12, w14, #3\n" + "ldr x14, [sp, #16]\n" // 8-byte Folded Reload + "lsl x12, x12, #2\n" + "mov x15, xzr\n" + "mov %[filter_workspace], x13\n" + "add x11, x14, x12\n" + "ldr x14, [sp, #24]\n" // 8-byte Folded Reload + "str x11, [sp, #248]\n" // 8-byte Folded Spill + "add x11, x14, x12\n" + "str x11, [sp, #240]\n" // 8-byte Folded Spill + "b " DC_KERNEL_MULT_8 "f\n" + DC_KERNEL_MULT_7 ":\n" // in Loop: Header=BB107_8 Depth=2 + "add x15, x15, #1\n" // =1 + "cmp x15, #2\n" // =2 + "add %[filter_workspace], x1, #4\n" // =4 + "mov v16.16b, v7.16b\n" + "mov v17.16b, v6.16b\n" + "mov v18.16b, v5.16b\n" + "b.eq " DC_KERNEL_MULT_4 "b\n" + DC_KERNEL_MULT_8 ":\n" // Parent Loop BB107_5 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB107_10 Depth 3 + // Child Loop BB107_14 Depth 3 + "ldr q19, [%[bias_data]], #16\n" + "ldr x11, [sp, #248]\n" // 8-byte Folded Reload + "lsl x12, x15, #4\n" + "ldr w13, [%[scratch_block_data]]\n" + "ldr x16, [sp, #328]\n" // 8-byte Folded Reload + "ldr q20, [x11, x12]\n" + "ldr x11, [sp, #240]\n" // 8-byte Folded Reload + "ldr w6, [%[scratch_block_data], x24]\n" + "ldr w16, [%[scratch_block_data], x16]\n" + "ldr q21, [x11, x12]\n" + "ldp x12, x14, [sp, #296]\n" // 16-byte Folded Reload + "fmov s22, w13\n" + "add x14, %[scratch_block_data], x14\n" + "mov v22.s[1], w13\n" + "fmov s23, w6\n" + "ldr w12, [%[scratch_block_data], x12]\n" + "ld1 { v22.s }[2], [x14]\n" + "add x14, %[scratch_block_data], x17\n" + "mov v23.s[1], w6\n" + "ld1 { v23.s }[2], [x14]\n" + "fmov s24, w16\n" + "mov v24.s[1], w16\n" + "dup v25.4s, w16\n" + "mov v28.16b, v19.16b\n" + "mov v29.16b, v19.16b\n" + "mov v30.16b, v19.16b\n" + "dup v26.4s, w12\n" + "mov v31.16b, v19.16b\n" + "mov v24.s[2], w12\n" + "cmp w25, #1\n" // =1 + ".word 0x4e99961c // sdot v28.4s, v16.16b, v25.16b\n" + ".word 0x4e99963d // sdot v29.4s, v17.16b, v25.16b\n" + ".word 0x4e99965e // sdot v30.4s, v18.16b, v25.16b\n" + "mov v24.s[3], w16\n" + "mov v22.s[3], w13\n" + "mov v23.s[3], w6\n" + ".word 0x4e9a965f // sdot v31.4s, v18.16b, v26.16b\n" + "b.lt " DC_KERNEL_MULT_15 "f\n" + // %bb.9: // in Loop: Header=BB107_8 Depth=2 + "stp x15, %[bias_data], [sp, #280]\n" // 16-byte Folded Spill + "mov w13, w25\n" + "str %[filter_workspace], [sp, #272]\n" // 8-byte Folded Spill + "mov x16, %[filter_workspace]\n" + "mov x14, %[scratch_block_data]\n" + "ldp x25, %[scratch_block_data], [sp, #216]\n" // 16-byte Folded Reload + "mov x24, x28\n" + "mov x28, x27\n" + "ldr x27, [sp, #200]\n" // 8-byte Folded Reload + "ldr x17, [sp, #184]\n" // 8-byte Folded Reload + "mov x9, x8\n" + "mov x8, x5\n" + "ldr x5, [sp, #168]\n" // 8-byte Folded Reload + "ldp x15, x10, [sp, #144]\n" // 16-byte Folded Reload + "ldr %[bias_data], [sp, #128]\n" // 8-byte Folded Reload + "ldp %[filter_workspace], x11, [sp, #88]\n" // 16-byte Folded Reload + "shl v25.4s, v18.4s, #8\n" + "shl v26.4s, v17.4s, #8\n" + "shl v27.4s, v16.4s, #8\n" + "mov x21, %[output_block_data]\n" + DC_KERNEL_MULT_10 ":\n" // Parent 
Loop BB107_5 Depth=1 + // Parent Loop BB107_8 Depth=2 + // => This Inner Loop Header: Depth=3 + ".word 0x4f96e25c // sdot v28.4s, v18.16b, v22.4b[0]\n" + ".word 0x4f96ea5d // sdot v29.4s, v18.16b, v22.4b[2]\n" + ".word 0x4f98ea3e // sdot v30.4s, v17.16b, v24.4b[2]\n" + ".word 0x4f96ea3c // sdot v28.4s, v17.16b, v22.4b[2]\n" + ".word 0x4f97e23f // sdot v31.4s, v17.16b, v23.4b[0]\n" + ".word 0x4f98ea1d // sdot v29.4s, v16.16b, v24.4b[2]\n" + ".word 0x4f97e21e // sdot v30.4s, v16.16b, v23.4b[0]\n" + "sqrdmulh v28.4s, v28.4s, v21.4s\n" + ".word 0x4f97ea1f // sdot v31.4s, v16.16b, v23.4b[2]\n" + "sqrdmulh v29.4s, v29.4s, v21.4s\n" + "sqrdmulh v30.4s, v30.4s, v21.4s\n" + "sqrshl v28.4s, v28.4s, v20.4s\n" + "sqrdmulh v31.4s, v31.4s, v21.4s\n" + "sqrshl v29.4s, v29.4s, v20.4s\n" + "sqrshl v30.4s, v30.4s, v20.4s\n" + "sqxtn v28.4h, v28.4s\n" + "sqrshl v31.4s, v31.4s, v20.4s\n" + "sqxtn v30.4h, v30.4s\n" + "sqxtn2 v28.8h, v29.4s\n" + "sqxtn2 v30.8h, v31.4s\n" + "sqadd v28.8h, v28.8h, v0.8h\n" + "sqadd v29.8h, v30.8h, v0.8h\n" + "sqxtn v28.8b, v28.8h\n" + "sqxtn2 v28.16b, v29.8h\n" + "smax v28.16b, v28.16b, v1.16b\n" + "add %[output_block_data], x15, x16\n" + "smin v28.16b, v28.16b, v2.16b\n" + "add x6, x11, x16\n" + "str s28, [x21, x16]\n" + "st1 { v28.s }[1], [%[output_block_data]]\n" + "add %[output_block_data], %[filter_workspace], x16\n" + "st1 { v28.s }[2], [x6]\n" + "st1 { v28.s }[3], [%[output_block_data]]\n" + "mov x12, x14\n" + "add x6, x14, x20\n" + "ldr w3, [x14, #4]!\n" + "ld1 { v24.s }[1], [x6]\n" + "add x6, x12, x19\n" + "ld1 { v23.s }[1], [x6]\n" + "mov v22.s[1], w3\n" + "add %[output_block_data], x12, x22\n" + "ld1 { v24.s }[3], [%[output_block_data]]\n" + "add %[output_block_data], x12, x7\n" + "ld1 { v22.s }[3], [%[output_block_data]]\n" + "add x12, x12, %[bias_data]\n" + "mov v28.16b, v19.16b\n" + "ld1 { v23.s }[3], [x12]\n" + "mov v29.16b, v19.16b\n" + "mov v30.16b, v19.16b\n" + ".word 0x4f96e33c // sdot v28.4s, v25.16b, v22.4b[0]\n" + "mov v31.16b, v19.16b\n" + ".word 0x4f98e33e // sdot v30.4s, v25.16b, v24.4b[0]\n" + ".word 0x4f96eb3d // sdot v29.4s, v25.16b, v22.4b[2]\n" + ".word 0x4f96eb5c // sdot v28.4s, v26.16b, v22.4b[2]\n" + ".word 0x4f98eb3f // sdot v31.4s, v25.16b, v24.4b[2]\n" + ".word 0x4f98eb5e // sdot v30.4s, v26.16b, v24.4b[2]\n" + ".word 0x4f98e35d // sdot v29.4s, v26.16b, v24.4b[0]\n" + ".word 0x4f98e37c // sdot v28.4s, v27.16b, v24.4b[0]\n" + ".word 0x4f97e35f // sdot v31.4s, v26.16b, v23.4b[0]\n" + ".word 0x4f97e37e // sdot v30.4s, v27.16b, v23.4b[0]\n" + ".word 0x4f98eb7d // sdot v29.4s, v27.16b, v24.4b[2]\n" + "sqrdmulh v28.4s, v28.4s, v21.4s\n" + ".word 0x4f97eb7f // sdot v31.4s, v27.16b, v23.4b[2]\n" + "sqrdmulh v30.4s, v30.4s, v21.4s\n" + "sqrdmulh v29.4s, v29.4s, v21.4s\n" + "sqrshl v28.4s, v28.4s, v20.4s\n" + "sqrdmulh v31.4s, v31.4s, v21.4s\n" + "sqrshl v30.4s, v30.4s, v20.4s\n" + "sqrshl v29.4s, v29.4s, v20.4s\n" + "sqxtn v28.4h, v28.4s\n" + "sqrshl v31.4s, v31.4s, v20.4s\n" + "sqxtn v30.4h, v30.4s\n" + "sqxtn2 v28.8h, v29.4s\n" + "sqxtn2 v30.8h, v31.4s\n" + "sqadd v28.8h, v28.8h, v0.8h\n" + "sqadd v29.8h, v30.8h, v0.8h\n" + "sqxtn v28.8b, v28.8h\n" + "sqxtn2 v28.16b, v29.8h\n" + "smax v28.16b, v28.16b, v1.16b\n" + "add x12, x25, x16\n" + "smin v28.16b, v28.16b, v2.16b\n" + "add %[output_block_data], x5, x16\n" + "str s28, [x23, x16]\n" + "st1 { v28.s }[1], [x12]\n" + "add x12, x17, x16\n" + "mov v29.16b, v19.16b\n" + "ushr v10.2d, v22.2d, #16\n" + "mov v30.16b, v19.16b\n" + "mov v31.16b, v19.16b\n" + "st1 { v28.s }[2], [%[output_block_data]]\n" + "st1 
{ v28.s }[3], [x12]\n" + "ushr v28.2d, v24.2d, #16\n" + ".word 0x4f8ae25d // sdot v29.4s, v18.16b, v10.4b[0]\n" + "mov v8.16b, v19.16b\n" + ".word 0x4f9ce25f // sdot v31.4s, v18.16b, v28.4b[0]\n" + ".word 0x4f8aea5e // sdot v30.4s, v18.16b, v10.4b[2]\n" + ".word 0x4f8aea3d // sdot v29.4s, v17.16b, v10.4b[2]\n" + "ushr v9.2d, v23.2d, #16\n" + ".word 0x4f9cea48 // sdot v8.4s, v18.16b, v28.4b[2]\n" + ".word 0x4f9cea3f // sdot v31.4s, v17.16b, v28.4b[2]\n" + ".word 0x4f9ce23e // sdot v30.4s, v17.16b, v28.4b[0]\n" + ".word 0x4f9ce21d // sdot v29.4s, v16.16b, v28.4b[0]\n" + ".word 0x4f89e228 // sdot v8.4s, v17.16b, v9.4b[0]\n" + ".word 0x4f89e21f // sdot v31.4s, v16.16b, v9.4b[0]\n" + ".word 0x4f9cea1e // sdot v30.4s, v16.16b, v28.4b[2]\n" + "sqrdmulh v29.4s, v29.4s, v21.4s\n" + ".word 0x4f89ea08 // sdot v8.4s, v16.16b, v9.4b[2]\n" + "sqrdmulh v31.4s, v31.4s, v21.4s\n" + "sqrdmulh v30.4s, v30.4s, v21.4s\n" + "sqrshl v29.4s, v29.4s, v20.4s\n" + "sqrdmulh v8.4s, v8.4s, v21.4s\n" + "sqrshl v31.4s, v31.4s, v20.4s\n" + "sqrshl v30.4s, v30.4s, v20.4s\n" + "sqxtn v29.4h, v29.4s\n" + "sqrshl v8.4s, v8.4s, v20.4s\n" + "sqxtn v31.4h, v31.4s\n" + "sqxtn2 v29.8h, v30.4s\n" + "sqxtn2 v31.8h, v8.4s\n" + "sqadd v29.8h, v29.8h, v0.8h\n" + "sqadd v30.8h, v31.8h, v0.8h\n" + "sqxtn v29.8b, v29.8h\n" + "sqxtn2 v29.16b, v30.8h\n" + "smax v29.16b, v29.16b, v1.16b\n" + "add %[output_block_data], x24, x16\n" + "smin v29.16b, v29.16b, v2.16b\n" + "mov v30.16b, v19.16b\n" + "add x12, x28, x16\n" + "str s29, [%[scratch_block_data], x16]\n" + "st1 { v29.s }[1], [%[output_block_data]]\n" + "add %[output_block_data], x26, x16\n" + "mov v31.16b, v19.16b\n" + "mov v8.16b, v19.16b\n" + ".word 0x4f8ae33e // sdot v30.4s, v25.16b, v10.4b[0]\n" + "st1 { v29.s }[2], [x12]\n" + "st1 { v29.s }[3], [%[output_block_data]]\n" + "mov v29.16b, v19.16b\n" + ".word 0x4f9ce328 // sdot v8.4s, v25.16b, v28.4b[0]\n" + ".word 0x4f8aeb3f // sdot v31.4s, v25.16b, v10.4b[2]\n" + ".word 0x4f8aeb5e // sdot v30.4s, v26.16b, v10.4b[2]\n" + ".word 0x4f9ceb3d // sdot v29.4s, v25.16b, v28.4b[2]\n" + ".word 0x4f9ceb48 // sdot v8.4s, v26.16b, v28.4b[2]\n" + ".word 0x4f9ce35f // sdot v31.4s, v26.16b, v28.4b[0]\n" + ".word 0x4f9ce37e // sdot v30.4s, v27.16b, v28.4b[0]\n" + ".word 0x4f89e35d // sdot v29.4s, v26.16b, v9.4b[0]\n" + ".word 0x4f89e368 // sdot v8.4s, v27.16b, v9.4b[0]\n" + ".word 0x4f9ceb7f // sdot v31.4s, v27.16b, v28.4b[2]\n" + "sqrdmulh v30.4s, v30.4s, v21.4s\n" + ".word 0x4f89eb7d // sdot v29.4s, v27.16b, v9.4b[2]\n" + "sqrdmulh v28.4s, v8.4s, v21.4s\n" + "sqrdmulh v31.4s, v31.4s, v21.4s\n" + "sqrshl v30.4s, v30.4s, v20.4s\n" + "sqrdmulh v29.4s, v29.4s, v21.4s\n" + "sqrshl v28.4s, v28.4s, v20.4s\n" + "sqrshl v31.4s, v31.4s, v20.4s\n" + "sqxtn v30.4h, v30.4s\n" + "ldr x12, [sp, #336]\n" // 8-byte Folded Reload + "sqrshl v29.4s, v29.4s, v20.4s\n" + "sqxtn v28.4h, v28.4s\n" + "sqxtn2 v30.8h, v31.4s\n" + "sqxtn2 v28.8h, v29.4s\n" + "sqadd v29.8h, v30.8h, v0.8h\n" + "sqadd v28.8h, v28.8h, v0.8h\n" + "sqxtn v29.8b, v29.8h\n" + "sqxtn2 v29.16b, v28.8h\n" + "smax v28.16b, v29.16b, v1.16b\n" + "add x12, x12, x16\n" + "smin v8.16b, v28.16b, v2.16b\n" + "mov v28.16b, v19.16b\n" + "mov v29.16b, v19.16b\n" + "mov v30.16b, v19.16b\n" + "mov v31.16b, v19.16b\n" + "ushr v24.2d, v24.2d, #32\n" + "add %[output_block_data], x9, x16\n" + "str s8, [x27, x16]\n" + "st1 { v8.s }[1], [x12]\n" + "add x12, x8, x16\n" + "subs w13, w13, #1\n" // =1 + "ushr v22.2d, v22.2d, #32\n" + "ushr v23.2d, v23.2d, #32\n" + ".word 0x4f98e21c // sdot v28.4s, v16.16b, v24.4b[0]\n" + 
".word 0x4f98e23d // sdot v29.4s, v17.16b, v24.4b[0]\n" + ".word 0x4f98e25e // sdot v30.4s, v18.16b, v24.4b[0]\n" + ".word 0x4f98ea5f // sdot v31.4s, v18.16b, v24.4b[2]\n" + "add x16, x16, x10\n" + "st1 { v8.s }[2], [%[output_block_data]]\n" + "st1 { v8.s }[3], [x12]\n" + "b.ne " DC_KERNEL_MULT_10 "b\n" + // %bb.11: // in Loop: Header=BB107_8 Depth=2 + "ldr w25, [sp, #212]\n" // 4-byte Folded Reload + "add x13, x21, x16\n" + "mov %[output_block_data], x21\n" + "ldp x21, %[scratch_block_data], [sp, #256]\n" // 16-byte Folded Reload + "ldr x6, [sp, #232]\n" // 8-byte Folded Reload + "mov x27, x28\n" + "mov x28, x24\n" + "ldr x24, [sp, #192]\n" // 8-byte Folded Reload + "ldr x17, [sp, #176]\n" // 8-byte Folded Reload + "ldp x15, %[bias_data], [sp, #280]\n" // 16-byte Folded Reload + "ldr %[filter_workspace], [sp, #272]\n" // 8-byte Folded Reload + "mov w12, w25\n" + "mov x5, x8\n" + "mov x8, x9\n" + "mov w10, #4\n" + "ldr w16, [sp, #324]\n" // 4-byte Folded Reload + "cmp w12, w16\n" + "b.ge " DC_KERNEL_MULT_7 "b\n" + DC_KERNEL_MULT_12 ":\n" // in Loop: Header=BB107_8 Depth=2 + "ldr w12, [sp, #320]\n" // 4-byte Folded Reload + "cmp w12, #1\n" // =1 + "b.lt " DC_KERNEL_MULT_7 "b\n" + // %bb.13: // in Loop: Header=BB107_8 Depth=2 + "add x12, x14, #4\n" // =4 + "ldr x16, [sp, #328]\n" // 8-byte Folded Reload + "add x14, x12, x24\n" + "ld1 { v23.s }[1], [x14]\n" + "add x14, x12, x17\n" + "add x16, x12, x16\n" + "ld1 { v24.s }[1], [x16]\n" + "ld1 { v23.s }[3], [x14]\n" + "ldp x16, x14, [sp, #296]\n" // 16-byte Folded Reload + "add x16, x12, x16\n" + "ld1 { v24.s }[3], [x16]\n" + "ldr x16, [sp, #64]\n" // 8-byte Folded Reload + "ld1 { v22.s }[1], [x12], x14\n" + "ldr w14, [sp, #320]\n" // 4-byte Folded Reload + "ld1 { v22.s }[3], [x12]\n" + DC_KERNEL_MULT_14 ":\n" // Parent Loop BB107_5 Depth=1 + // Parent Loop BB107_8 Depth=2 + // => This Inner Loop Header: Depth=3 + ".word 0x4f96e25c // sdot v28.4s, v18.16b, v22.4b[0]\n" + ".word 0x4f96ea5d // sdot v29.4s, v18.16b, v22.4b[2]\n" + ".word 0x4f98ea3e // sdot v30.4s, v17.16b, v24.4b[2]\n" + ".word 0x4f96ea3c // sdot v28.4s, v17.16b, v22.4b[2]\n" + ".word 0x4f97e23f // sdot v31.4s, v17.16b, v23.4b[0]\n" + ".word 0x4f98ea1d // sdot v29.4s, v16.16b, v24.4b[2]\n" + ".word 0x4f97e21e // sdot v30.4s, v16.16b, v23.4b[0]\n" + "sqrdmulh v25.4s, v28.4s, v21.4s\n" + ".word 0x4f97ea1f // sdot v31.4s, v16.16b, v23.4b[2]\n" + "sqrdmulh v26.4s, v29.4s, v21.4s\n" + "sqrdmulh v27.4s, v30.4s, v21.4s\n" + "sqrshl v25.4s, v25.4s, v20.4s\n" + "sqrdmulh v28.4s, v31.4s, v21.4s\n" + "sqrshl v26.4s, v26.4s, v20.4s\n" + "sqrshl v27.4s, v27.4s, v20.4s\n" + "sqxtn v25.4h, v25.4s\n" + "sqrshl v28.4s, v28.4s, v20.4s\n" + "sqxtn v27.4h, v27.4s\n" + "sqxtn2 v25.8h, v26.4s\n" + "sqxtn2 v27.8h, v28.4s\n" + "sqadd v25.8h, v25.8h, v0.8h\n" + "sqadd v26.8h, v27.8h, v0.8h\n" + "sqxtn v25.8b, v25.8h\n" + "sqxtn2 v25.16b, v26.8h\n" + "smax v25.16b, v25.16b, v1.16b\n" + "add x12, x13, x21\n" + "smin v25.16b, v25.16b, v2.16b\n" + "str s25, [x13]\n" + "st1 { v25.s }[1], [x12]\n" + "add x12, x13, x6\n" + "ushr v24.2d, v24.2d, #8\n" + "mov v28.16b, v19.16b\n" + "mov v29.16b, v19.16b\n" + "mov v30.16b, v19.16b\n" + "mov v31.16b, v19.16b\n" + "st1 { v25.s }[2], [x12]\n" + "add x12, x13, x16\n" + "subs w14, w14, #1\n" // =1 + "ushr v22.2d, v22.2d, #8\n" + "ushr v23.2d, v23.2d, #8\n" + ".word 0x4f98e21c // sdot v28.4s, v16.16b, v24.4b[0]\n" + ".word 0x4f98e23d // sdot v29.4s, v17.16b, v24.4b[0]\n" + ".word 0x4f98e25e // sdot v30.4s, v18.16b, v24.4b[0]\n" + "add x13, x13, %[function_params]\n" + 
".word 0x4f98ea5f // sdot v31.4s, v18.16b, v24.4b[2]\n" + "st1 { v25.s }[3], [x12]\n" + "b.ne " DC_KERNEL_MULT_14 "b\n" + "b " DC_KERNEL_MULT_7 "b\n" + DC_KERNEL_MULT_15 ":\n" // in Loop: Header=BB107_8 Depth=2 + "ldr x11, [sp, #72]\n" // 8-byte Folded Reload + "ldr x6, [sp, #232]\n" // 8-byte Folded Reload + "mov w12, wzr\n" + "mov x14, %[scratch_block_data]\n" + "add x13, x11, x15, lsl #2\n" + "ldr w16, [sp, #324]\n" // 4-byte Folded Reload + "cmp w12, w16\n" + "b.lt " DC_KERNEL_MULT_12 "b\n" + "b " DC_KERNEL_MULT_7 "b\n" + DC_KERNEL_MULT_16 ":\n" // in Loop: Header=BB107_5 Depth=1 + "ldr w16, [sp, #12]\n" // 4-byte Folded Reload + "add x11, %[bias_data], #32\n" // =32 + "tbnz w16, #0, " DC_KERNEL_MULT_3 "b\n" + // %bb.17: // in Loop: Header=BB107_5 Depth=1 + "ldp x13, x16, [sp, #16]\n" // 16-byte Folded Reload + "mov x12, x14\n" + "lsl w12, w12, #3\n" + "lsl x12, x12, #2\n" + "add x13, x13, x12\n" + "add x12, x16, x12\n" + "ldp q19, q20, [%[bias_data]]\n" + "ldp q21, q22, [x13]\n" + "ldp q23, q24, [x12]\n" + "ldr x15, [sp, #72]\n" // 8-byte Folded Reload + "ldr %[scratch_block_data], [sp, #304]\n" // 8-byte Folded Reload + "mov x21, %[output_block_data]\n" + "mov x14, xzr\n" + "b " DC_KERNEL_MULT_19 "f\n" + DC_KERNEL_MULT_18 ":\n" // in Loop: Header=BB107_19 Depth=2 + "ldr x12, [sp, #80]\n" // 8-byte Folded Reload + "add x14, x14, #1\n" // =1 + "cmp x14, x12\n" + "ldr x12, [sp, #256]\n" // 8-byte Folded Reload + "add x15, x15, x12\n" + "b.eq " DC_KERNEL_MULT_2 "b\n" + DC_KERNEL_MULT_19 ":\n" // Parent Loop BB107_5 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB107_21 Depth 3 + // Child Loop BB107_22 Depth 4 + "ldr x12, [sp, #264]\n" // 8-byte Folded Reload + "mov w13, wzr\n" + "madd x6, x14, %[scratch_block_data], x12\n" + "ldr w12, [x6]\n" + "add x16, x6, %[scratch_block_data]\n" + "fmov s25, w12\n" + "mov v25.s[1], w12\n" + "ld1 { v25.s }[2], [x16]\n" + "ldr x16, [sp, #328]\n" // 8-byte Folded Reload + "mov v25.s[3], w12\n" + "add x16, x6, x16\n" + "ld1r { v26.4s }, [x16]\n" + "mov x16, x15\n" + "b " DC_KERNEL_MULT_21 "f\n" + DC_KERNEL_MULT_20 ":\n" // in Loop: Header=BB107_21 Depth=3 + "ldr w12, [sp, #324]\n" // 4-byte Folded Reload + "add w13, w13, #1\n" // =1 + "cmp w13, w12\n" + "b.eq " DC_KERNEL_MULT_18 "b\n" + DC_KERNEL_MULT_21 ":\n" // Parent Loop BB107_5 Depth=1 + // Parent Loop BB107_19 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB107_22 Depth 4 + "ldr %[output_block_data], [sp, #328]\n" // 8-byte Folded Reload + "add x6, x6, #4\n" // =4 + "mov x12, x6\n" + "ld1 { v25.s }[1], [x12], %[output_block_data]\n" + "ldr w3, [sp, #316]\n" // 4-byte Folded Reload + "ld1 { v26.s }[1], [x12]\n" + "ldr w12, [sp, #320]\n" // 4-byte Folded Reload + "cmp w13, w3\n" + "add %[output_block_data], x6, %[scratch_block_data]\n" + "ld1 { v25.s }[3], [%[output_block_data]]\n" + "csel w12, w12, w10, eq\n" + "cmp w12, #1\n" // =1 + "b.lt " DC_KERNEL_MULT_20 "b\n" + DC_KERNEL_MULT_22 ":\n" // Parent Loop BB107_5 Depth=1 + // Parent Loop BB107_19 Depth=2 + // Parent Loop BB107_21 Depth=3 + // => This Inner Loop Header: Depth=4 + "mov v27.16b, v19.16b\n" + "mov v28.16b, v20.16b\n" + ".word 0x4f99e25b // sdot v27.4s, v18.16b, v25.4b[0]\n" + ".word 0x4f99e0bc // sdot v28.4s, v5.16b, v25.4b[0]\n" + ".word 0x4f99ea3b // sdot v27.4s, v17.16b, v25.4b[2]\n" + ".word 0x4f99e8dc // sdot v28.4s, v6.16b, v25.4b[2]\n" + ".word 0x4f9ae21b // sdot v27.4s, v16.16b, v26.4b[0]\n" + ".word 0x4f9ae0fc // sdot v28.4s, v7.16b, v26.4b[0]\n" + "sqrdmulh v27.4s, v27.4s, v23.4s\n" + "sqrdmulh v28.4s, 
v28.4s, v24.4s\n" + "sqrshl v27.4s, v27.4s, v21.4s\n" + "sqrshl v28.4s, v28.4s, v22.4s\n" + "sqxtn v27.4h, v27.4s\n" + "sqxtn2 v27.8h, v28.4s\n" + "sqadd v27.8h, v27.8h, v0.8h\n" + "sqxtn v27.8b, v27.8h\n" + "smax v27.8b, v27.8b, v3.8b\n" + "smin v27.8b, v27.8b, v4.8b\n" + "subs w12, w12, #1\n" // =1 + "ushr v25.2d, v25.2d, #8\n" + "ushr v26.2d, v26.2d, #8\n" + "str d27, [x16]\n" + "add x16, x16, %[function_params]\n" + "b.ne " DC_KERNEL_MULT_22 "b\n" + "b " DC_KERNEL_MULT_20 "b\n" + DC_KERNEL_MULT_23 ":\n" + + + // Compiled intrinsics total stack 448, now 352 for spillage only. + "add sp, sp, #352\n" // =448 + : + // Outputs. + [ scratch_block_data ] "+r"(scratch_block_data), + [ filter_workspace ] "+r"(filter_workspace), + [ bias_data ] "+r"(bias_data), + [ output_block_data ] "+r"(output_block_data) + : + // Inputs. + [ function_params ] "r"(function_params) + : + // Clobbers. + "cc", "memory", + // We use these NEON registers. + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31", + // We use these general-purpose registers. + "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", + "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", + "x27", "x28"); + +#undef DC_KERNEL_MULT_1 +#undef DC_KERNEL_MULT_2 +#undef DC_KERNEL_MULT_3 +#undef DC_KERNEL_MULT_4 +#undef DC_KERNEL_MULT_5 +#undef DC_KERNEL_MULT_6 +#undef DC_KERNEL_MULT_7 +#undef DC_KERNEL_MULT_8 +#undef DC_KERNEL_MULT_9 +#undef DC_KERNEL_MULT_10 +#undef DC_KERNEL_MULT_11 +#undef DC_KERNEL_MULT_12 +#undef DC_KERNEL_MULT_13 +#undef DC_KERNEL_MULT_14 +#undef DC_KERNEL_MULT_15 +#undef DC_KERNEL_MULT_16 +#undef DC_KERNEL_MULT_17 +#undef DC_KERNEL_MULT_18 +#undef DC_KERNEL_MULT_19 +#undef DC_KERNEL_MULT_20 +#undef DC_KERNEL_MULT_21 +#undef DC_KERNEL_MULT_22 +#undef DC_KERNEL_MULT_23 + } // NOLINT(readability/fn_size) Manually unrolled. + + static inline void Run(const int8* scratch_block_data, + const int8* filter_workspace, const int32* bias_data, + int8* output_block_data, + const DepthwiseConvDotProdParams* function_params) { + KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data, + output_block_data, function_params); + } +}; + +template <> +struct KernelMacroBlock { + static inline void KernelMacroBlockNeon( + const int8* scratch_block_data, const int8* filter_workspace, + const int32* bias_data, int8* output_block_data, + const DepthwiseConvDotProdParams* function_params) { + // Note that argument registers may be reused after parameter loading. + // x0 %[scratch_block_data] + // x1 %[filter_workspace] + // x2 %[bias_data] + // x3 %[output_block_data] + // x4 %[function_params] +#define DC_KERNEL_MULT_STRIDE_1 "1" +#define DC_KERNEL_MULT_STRIDE_2 "2" +#define DC_KERNEL_MULT_STRIDE_3 "3" +#define DC_KERNEL_MULT_STRIDE_4 "4" +#define DC_KERNEL_MULT_STRIDE_5 "5" +#define DC_KERNEL_MULT_STRIDE_6 "6" +#define DC_KERNEL_MULT_STRIDE_7 "7" +#define DC_KERNEL_MULT_STRIDE_8 "8" +#define DC_KERNEL_MULT_STRIDE_9 "9" +#define DC_KERNEL_MULT_STRIDE_10 "10" +#define DC_KERNEL_MULT_STRIDE_11 "11" +#define DC_KERNEL_MULT_STRIDE_12 "12" +#define DC_KERNEL_MULT_STRIDE_13 "13" +#define DC_KERNEL_MULT_STRIDE_14 "14" +#define DC_KERNEL_MULT_STRIDE_15 "15" +#define DC_KERNEL_MULT_STRIDE_16 "16" +#define DC_KERNEL_MULT_STRIDE_17 "17" +#define DC_KERNEL_MULT_STRIDE_18 "18" + + asm volatile( + // Compiled code used block of 32 for spill out of total stack of 112. 
+ "sub sp, sp, #32\n" // =112 + + + "ldr w8, [%[function_params], #" STR(DP_OFFSET_DEPTH_MICRO_REPEATS) "]\n" + "cmp w8, #1\n" // =1 + "b.lt " DC_KERNEL_MULT_STRIDE_18 "f\n" + // %bb.1: + "ldr w7, [%[function_params], #" STR(DP_OFFSET_OUTPUT_RESIDUAL_WIDTH) "]\n" + "ldp w12, w22, [%[function_params], #" STR(DP_OFFSET_OUTPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n" + "ldpsw x10, x11, [%[function_params], #" STR(DP_OFFSET_OUTPUT_HEIGHT_STRIDE) "]\n" + "ldrsw x17, [%[function_params], #" STR(DP_OFFSET_OUTPUT_DEPTH) "]\n" + "add x13, %[function_params], #" STR(DP_OFFSET_OUTPUT_OFFSET) "\n" // =28 + "add x14, %[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MAX) "\n" // =44 + "add x6, %[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MIN) "\n" // =40 + "cmp w7, #2\n" // =2 + "ldp x15, x16, [%[function_params], #" STR(DP_OFFSET_OUTPUT_MULTPLIPLIER_PER_CHANNEL) "]\n" + "ldr w4, [%[function_params], #" STR(DP_OFFSET_OUTBOUND_BLOCK_HEIGHT) "]\n" + "ld1r { v0.8h }, [x13]\n" + "ld1r { v1.8b }, [x6]\n" + "ld1r { v2.8b }, [x14]\n" + "ccmp w22, w12, #0, lt\n" + "add x13, x10, x17\n" + "str x22, [sp]\n" // 8-byte Folded Spill + "csel w22, w22, w12, lt\n" + "lsl x6, x11, #1\n" + "add x21, x13, #4\n" // =4 + "bic w13, w22, w22, asr #31\n" + "mov x9, xzr\n" + "add x5, %[scratch_block_data], #4\n" // =4 + "str w7, [sp, #12]\n" // 4-byte Folded Spill + "add x7, x17, #4\n" // =4 + "add x19, x10, #4\n" // =4 + "add x20, x6, x11\n" + "lsl x14, x13, #2\n" + "sub x13, x12, x13\n" + "stp x13, x14, [sp, #16]\n" // 16-byte Folded Spill + "b " DC_KERNEL_MULT_STRIDE_3 "f\n" + DC_KERNEL_MULT_STRIDE_2 ":\n" // in Loop: Header=BB108_3 Depth=1 + "add x9, x9, #1\n" // =1 + "cmp x9, x8\n" + "b.eq " DC_KERNEL_MULT_STRIDE_18 "f\n" + DC_KERNEL_MULT_STRIDE_3 ":\n" // =>This Loop Header: Depth=1 + // Child Loop BB108_16 Depth 2 + // Child Loop BB108_11 Depth 2 + // Child Loop BB108_6 Depth 2 + // Child Loop BB108_13 Depth 2 + "lsl w13, w9, #3\n" + "lsl x14, x13, #2\n" + "add x23, x16, x14\n" + "ldp q19, q20, [x23]\n" + "ldr w23, [%[scratch_block_data]]\n" + "add x14, x15, x14\n" + "ldp q21, q22, [x14]\n" + "add x14, %[scratch_block_data], x11\n" + "fmov s23, w23\n" + "mov v23.s[1], w23\n" + "ld1 { v23.s }[2], [x14]\n" + "ldp q3, q4, [%[filter_workspace]]\n" + "ldp q5, q6, [%[filter_workspace], #32]\n" + "ldp q7, q16, [%[filter_workspace], #64]\n" + "ldp q17, q18, [%[bias_data]], #32\n" + "ldr s24, [%[scratch_block_data], x6]\n" + "add %[filter_workspace], x1, #96\n" // =96 + "add x25, %[output_block_data], x13\n" + "cmp w4, #2\n" // =2 + "mov v23.s[3], w23\n" + "b.ne " DC_KERNEL_MULT_STRIDE_8 "f\n" + // %bb.4: // in Loop: Header=BB108_3 Depth=1 + "dup v24.4s, v24.s[0]\n" + "add x13, %[scratch_block_data], x20\n" + "add x14, %[scratch_block_data], x11, lsl #2\n" + "ld1 { v24.s }[2], [x13]\n" + "ld1r { v25.4s }, [x14]\n" + "cmp w22, #1\n" // =1 + "lsl x26, x11, #2\n" + "b.lt " DC_KERNEL_MULT_STRIDE_12 "f\n" + // %bb.5: // in Loop: Header=BB108_3 Depth=1 + "mov x27, xzr\n" + "mov x28, x22\n" + DC_KERNEL_MULT_STRIDE_6 ":\n" // Parent Loop BB108_3 Depth=1 + // => This Inner Loop Header: Depth=2 + "and x13, x27, #0xfffffffc\n" + "add x13, x5, x13\n" + "mov x23, x13\n" + "ld1 { v23.s }[1], [x23], x26\n" + "add x24, x13, x6\n" + "ld1 { v24.s }[1], [x24]\n" + "add x14, x13, x11\n" + "add x24, x13, x20\n" + "ld1 { v23.s }[3], [x14]\n" + "ld1 { v24.s }[3], [x24]\n" + "mov v27.16b, v17.16b\n" + "ld1 { v25.s }[1], [x23]\n" + "mov v28.16b, v17.16b\n" + ".word 0x4f97e07b // sdot v27.4s, v3.16b, v23.4b[0]\n" + ".word 0x4f98e07c // 
sdot v28.4s, v3.16b, v24.4b[0]\n" + ".word 0x4f97e8bb // sdot v27.4s, v5.16b, v23.4b[2]\n" + ".word 0x4f98e8bc // sdot v28.4s, v5.16b, v24.4b[2]\n" + ".word 0x4f98e0fb // sdot v27.4s, v7.16b, v24.4b[0]\n" + ".word 0x4f99e0fc // sdot v28.4s, v7.16b, v25.4b[0]\n" + "sqrdmulh v27.4s, v27.4s, v21.4s\n" + "sqrdmulh v28.4s, v28.4s, v21.4s\n" + "sqrshl v27.4s, v27.4s, v19.4s\n" + "sqrshl v28.4s, v28.4s, v19.4s\n" + "sqxtn v31.4h, v27.4s\n" + "sqxtn2 v31.8h, v28.4s\n" + "mov v29.16b, v18.16b\n" + "sqadd v28.8h, v31.8h, v0.8h\n" + "mov v30.16b, v18.16b\n" + "sqxtn v28.8b, v28.8h\n" + ".word 0x4f97e09d // sdot v29.4s, v4.16b, v23.4b[0]\n" + "add x13, x25, x19\n" + "smax v28.8b, v28.8b, v1.8b\n" + ".word 0x4f98e09e // sdot v30.4s, v4.16b, v24.4b[0]\n" + ".word 0x4f97e8dd // sdot v29.4s, v6.16b, v23.4b[2]\n" + "sub x23, x13, #4\n" // =4 + "smin v28.8b, v28.8b, v2.8b\n" + ".word 0x4f98e8de // sdot v30.4s, v6.16b, v24.4b[2]\n" + ".word 0x4f98e21d // sdot v29.4s, v16.16b, v24.4b[0]\n" + "str s28, [x25]\n" + "st1 { v28.s }[1], [x23]\n" + ".word 0x4f99e21e // sdot v30.4s, v16.16b, v25.4b[0]\n" + "sqrdmulh v28.4s, v29.4s, v22.4s\n" + "sqrdmulh v29.4s, v30.4s, v22.4s\n" + "sqrshl v28.4s, v28.4s, v20.4s\n" + "sqrshl v29.4s, v29.4s, v20.4s\n" + "sqxtn v28.4h, v28.4s\n" + "sqxtn2 v28.8h, v29.4s\n" + "sqadd v28.8h, v28.8h, v0.8h\n" + "sqxtn v28.8b, v28.8h\n" + "smax v28.8b, v28.8b, v1.8b\n" + "smin v28.8b, v28.8b, v2.8b\n" + "mov v26.16b, v17.16b\n" + "str s28, [x25, #4]\n" + "mov v29.16b, v18.16b\n" + "st1 { v28.s }[1], [x13]\n" + "ushr v28.2d, v23.2d, #16\n" + ".word 0x4f9ce07a // sdot v26.4s, v3.16b, v28.4b[0]\n" + ".word 0x4f9ce09d // sdot v29.4s, v4.16b, v28.4b[0]\n" + "mov v27.16b, v17.16b\n" + "mov v30.16b, v18.16b\n" + ".word 0x4f9ce8ba // sdot v26.4s, v5.16b, v28.4b[2]\n" + ".word 0x4f9ce8dd // sdot v29.4s, v6.16b, v28.4b[2]\n" + "ushr v28.2d, v24.2d, #16\n" + ".word 0x4f9ce07b // sdot v27.4s, v3.16b, v28.4b[0]\n" + ".word 0x4f9ce09e // sdot v30.4s, v4.16b, v28.4b[0]\n" + ".word 0x4f9ce8bb // sdot v27.4s, v5.16b, v28.4b[2]\n" + ".word 0x4f9ce8de // sdot v30.4s, v6.16b, v28.4b[2]\n" + ".word 0x4f9ce0fa // sdot v26.4s, v7.16b, v28.4b[0]\n" + ".word 0x4f9ce21d // sdot v29.4s, v16.16b, v28.4b[0]\n" + "ushr v28.2d, v25.2d, #16\n" + ".word 0x4f9ce0fb // sdot v27.4s, v7.16b, v28.4b[0]\n" + "sqrdmulh v26.4s, v26.4s, v21.4s\n" + "sqrdmulh v27.4s, v27.4s, v21.4s\n" + "sqrshl v26.4s, v26.4s, v19.4s\n" + "sqrshl v27.4s, v27.4s, v19.4s\n" + "sqxtn v26.4h, v26.4s\n" + "sqxtn2 v26.8h, v27.4s\n" + "sqadd v26.8h, v26.8h, v0.8h\n" + ".word 0x4f9ce21e // sdot v30.4s, v16.16b, v28.4b[0]\n" + "sqrdmulh v28.4s, v29.4s, v22.4s\n" + "sqxtn v26.8b, v26.8h\n" + "add x24, x25, x21\n" + "sqrdmulh v29.4s, v30.4s, v22.4s\n" + "sqrshl v28.4s, v28.4s, v20.4s\n" + "smax v26.8b, v26.8b, v1.8b\n" + "add x23, x25, x7\n" + "sub x13, x24, #4\n" // =4 + "sqrshl v29.4s, v29.4s, v20.4s\n" + "sqxtn v28.4h, v28.4s\n" + "smin v26.8b, v26.8b, v2.8b\n" + "stur s26, [x23, #-4]\n" + "st1 { v26.s }[1], [x13]\n" + "sqxtn2 v28.8h, v29.4s\n" + "sqadd v26.8h, v28.8h, v0.8h\n" + "sqxtn v26.8b, v26.8h\n" + "add x14, x25, x17\n" + "smax v26.8b, v26.8b, v1.8b\n" + "subs x28, x28, #1\n" // =1 + "ushr v23.2d, v23.2d, #32\n" + "ushr v24.2d, v24.2d, #32\n" + "ushr v25.2d, v25.2d, #32\n" + "add x25, x14, x17\n" + "smin v26.8b, v26.8b, v2.8b\n" + "add x27, x27, #4\n" // =4 + "str s26, [x23]\n" + "st1 { v26.s }[1], [x24]\n" + "b.ne " DC_KERNEL_MULT_STRIDE_6 "b\n" + // %bb.7: // in Loop: Header=BB108_3 Depth=1 + "mov w13, w22\n" + "cmp w13, w12\n" + "ldp x13, x27, 
[sp, #16]\n" // 16-byte Folded Reload + "b.lt " DC_KERNEL_MULT_STRIDE_13 "f\n" + "b " DC_KERNEL_MULT_STRIDE_2 "b\n" + DC_KERNEL_MULT_STRIDE_8 ":\n" // in Loop: Header=BB108_3 Depth=1 + "cmp w12, #1\n" // =1 + "b.lt " DC_KERNEL_MULT_STRIDE_2 "b\n" + // %bb.9: // in Loop: Header=BB108_3 Depth=1 + "ldr w13, [sp, #12]\n" // 4-byte Folded Reload + "dup v24.4s, v24.s[0]\n" + "cmp w13, #2\n" // =2 + "b.ne " DC_KERNEL_MULT_STRIDE_14 "f\n" + // %bb.10: // in Loop: Header=BB108_3 Depth=1 + "mov x26, xzr\n" + "mov x13, x12\n" + DC_KERNEL_MULT_STRIDE_11 ":\n" // Parent Loop BB108_3 Depth=1 + // => This Inner Loop Header: Depth=2 + "and x14, x26, #0xfffffffc\n" + "add x14, x5, x14\n" + "mov x23, x14\n" + "ld1 { v23.s }[1], [x23], x6\n" + "add x14, x14, x11\n" + "mov v26.16b, v17.16b\n" + "mov v27.16b, v18.16b\n" + "ld1 { v24.s }[1], [x23]\n" + "ld1 { v23.s }[3], [x14]\n" + "mov v25.16b, v17.16b\n" + "add x14, x25, x17\n" + "ushr v28.2d, v24.2d, #16\n" + ".word 0x4f9ce0fa // sdot v26.4s, v7.16b, v28.4b[0]\n" + ".word 0x4f9ce21b // sdot v27.4s, v16.16b, v28.4b[0]\n" + "ushr v28.2d, v23.2d, #16\n" + ".word 0x4f9ce07a // sdot v26.4s, v3.16b, v28.4b[0]\n" + ".word 0x4f9ce09b // sdot v27.4s, v4.16b, v28.4b[0]\n" + ".word 0x4f9ce8ba // sdot v26.4s, v5.16b, v28.4b[2]\n" + ".word 0x4f9ce8db // sdot v27.4s, v6.16b, v28.4b[2]\n" + "mov v28.16b, v18.16b\n" + ".word 0x4f98e0f9 // sdot v25.4s, v7.16b, v24.4b[0]\n" + ".word 0x4f98e21c // sdot v28.4s, v16.16b, v24.4b[0]\n" + ".word 0x4f97e079 // sdot v25.4s, v3.16b, v23.4b[0]\n" + ".word 0x4f97e09c // sdot v28.4s, v4.16b, v23.4b[0]\n" + ".word 0x4f97e8b9 // sdot v25.4s, v5.16b, v23.4b[2]\n" + ".word 0x4f97e8dc // sdot v28.4s, v6.16b, v23.4b[2]\n" + "sqrdmulh v25.4s, v25.4s, v21.4s\n" + "sqrdmulh v28.4s, v28.4s, v22.4s\n" + "sqrshl v25.4s, v25.4s, v19.4s\n" + "sqrshl v28.4s, v28.4s, v20.4s\n" + "sqxtn v25.4h, v25.4s\n" + "sqxtn2 v25.8h, v28.4s\n" + "sqadd v25.8h, v25.8h, v0.8h\n" + "sqrdmulh v26.4s, v26.4s, v21.4s\n" + "sqxtn v25.8b, v25.8h\n" + "sqrdmulh v27.4s, v27.4s, v22.4s\n" + "sqrshl v26.4s, v26.4s, v19.4s\n" + "smax v25.8b, v25.8b, v1.8b\n" + "sqrshl v27.4s, v27.4s, v20.4s\n" + "sqxtn v26.4h, v26.4s\n" + "smin v25.8b, v25.8b, v2.8b\n" + "str d25, [x25]\n" + "sqxtn2 v26.8h, v27.4s\n" + "sqadd v25.8h, v26.8h, v0.8h\n" + "sqxtn v25.8b, v25.8h\n" + "smax v25.8b, v25.8b, v1.8b\n" + "smin v25.8b, v25.8b, v2.8b\n" + "subs x13, x13, #1\n" // =1 + "ushr v24.2d, v24.2d, #32\n" + "ushr v23.2d, v23.2d, #32\n" + "str d25, [x25, x17]\n" + "add x25, x14, x17\n" + "add x26, x26, #4\n" // =4 + "b.ne " DC_KERNEL_MULT_STRIDE_11 "b\n" + "b " DC_KERNEL_MULT_STRIDE_2 "b\n" + DC_KERNEL_MULT_STRIDE_12 ":\n" // in Loop: Header=BB108_3 Depth=1 + "mov w13, wzr\n" + "cmp w13, w12\n" + "ldp x13, x27, [sp, #16]\n" // 16-byte Folded Reload + "b.ge " DC_KERNEL_MULT_STRIDE_2 "b\n" + DC_KERNEL_MULT_STRIDE_13 ":\n" // Parent Loop BB108_3 Depth=1 + // => This Inner Loop Header: Depth=2 + "and x14, x27, #0xfffffffc\n" + "add x14, x5, x14\n" + "mov x24, x14\n" + "add x23, x14, x6\n" + "ld1 { v23.s }[1], [x24], x26\n" + "ld1 { v24.s }[1], [x23]\n" + "add x23, x14, x11\n" + "add x14, x14, x20\n" + "ld1 { v23.s }[3], [x23]\n" + "ld1 { v24.s }[3], [x14]\n" + "mov v26.16b, v17.16b\n" + "ld1 { v25.s }[1], [x24]\n" + "mov v27.16b, v17.16b\n" + ".word 0x4f97e07a // sdot v26.4s, v3.16b, v23.4b[0]\n" + ".word 0x4f98e07b // sdot v27.4s, v3.16b, v24.4b[0]\n" + ".word 0x4f97e8ba // sdot v26.4s, v5.16b, v23.4b[2]\n" + ".word 0x4f98e8bb // sdot v27.4s, v5.16b, v24.4b[2]\n" + ".word 0x4f98e0fa // sdot v26.4s, 
v7.16b, v24.4b[0]\n" + ".word 0x4f99e0fb // sdot v27.4s, v7.16b, v25.4b[0]\n" + "sqrdmulh v26.4s, v26.4s, v21.4s\n" + "sqrdmulh v27.4s, v27.4s, v21.4s\n" + "sqrshl v26.4s, v26.4s, v19.4s\n" + "sqrshl v27.4s, v27.4s, v19.4s\n" + "sqxtn v26.4h, v26.4s\n" + "sqxtn2 v26.8h, v27.4s\n" + "sqadd v26.8h, v26.8h, v0.8h\n" + "sqxtn v26.8b, v26.8h\n" + "smax v26.8b, v26.8b, v1.8b\n" + "add x14, x25, x10\n" + "mov v27.16b, v18.16b\n" + "smin v26.8b, v26.8b, v2.8b\n" + "str s26, [x25]\n" + "st1 { v26.s }[1], [x14]\n" + "mov v26.16b, v18.16b\n" + ".word 0x4f97e09b // sdot v27.4s, v4.16b, v23.4b[0]\n" + ".word 0x4f98e09a // sdot v26.4s, v4.16b, v24.4b[0]\n" + ".word 0x4f97e8db // sdot v27.4s, v6.16b, v23.4b[2]\n" + ".word 0x4f98e8da // sdot v26.4s, v6.16b, v24.4b[2]\n" + ".word 0x4f98e21b // sdot v27.4s, v16.16b, v24.4b[0]\n" + ".word 0x4f99e21a // sdot v26.4s, v16.16b, v25.4b[0]\n" + "sqrdmulh v27.4s, v27.4s, v22.4s\n" + "sqrdmulh v26.4s, v26.4s, v22.4s\n" + "sqrshl v27.4s, v27.4s, v20.4s\n" + "sqrshl v26.4s, v26.4s, v20.4s\n" + "sqxtn v27.4h, v27.4s\n" + "sqxtn2 v27.8h, v26.4s\n" + "sqadd v26.8h, v27.8h, v0.8h\n" + "sqxtn v26.8b, v26.8h\n" + "smax v26.8b, v26.8b, v1.8b\n" + "smin v26.8b, v26.8b, v2.8b\n" + "subs x13, x13, #1\n" // =1 + "add x14, x14, #4\n" // =4 + "ushr v23.2d, v23.2d, #16\n" + "ushr v24.2d, v24.2d, #16\n" + "ushr v25.2d, v25.2d, #16\n" + "str s26, [x25, #4]\n" + "add x25, x25, x17\n" + "add x27, x27, #4\n" // =4 + "st1 { v26.s }[1], [x14]\n" + "b.ne " DC_KERNEL_MULT_STRIDE_13 "b\n" + "b " DC_KERNEL_MULT_STRIDE_2 "b\n" + DC_KERNEL_MULT_STRIDE_14 ":\n" // in Loop: Header=BB108_3 Depth=1 + "ldr x27, [sp]\n" // 8-byte Folded Reload + "mov x13, xzr\n" + "mov x26, x12\n" + "b " DC_KERNEL_MULT_STRIDE_16 "f\n" + DC_KERNEL_MULT_STRIDE_15 ":\n" // in Loop: Header=BB108_16 Depth=2 + "add x13, x13, #4\n" // =4 + "subs x26, x26, #1\n" // =1 + "sub x27, x27, #1\n" // =1 + "mov v23.16b, v25.16b\n" + "mov v24.16b, v26.16b\n" + "b.eq " DC_KERNEL_MULT_STRIDE_2 "b\n" + DC_KERNEL_MULT_STRIDE_16 ":\n" // Parent Loop BB108_3 Depth=1 + // => This Inner Loop Header: Depth=2 + "and x14, x13, #0xfffffffc\n" + "add x14, x5, x14\n" + "mov x23, x14\n" + "ld1 { v23.s }[1], [x23], x6\n" + "add x14, x14, x11\n" + "mov v25.16b, v17.16b\n" + "mov v26.16b, v18.16b\n" + "ld1 { v24.s }[1], [x23]\n" + "ld1 { v23.s }[3], [x14]\n" + ".word 0x4f98e0f9 // sdot v25.4s, v7.16b, v24.4b[0]\n" + ".word 0x4f98e21a // sdot v26.4s, v16.16b, v24.4b[0]\n" + ".word 0x4f97e079 // sdot v25.4s, v3.16b, v23.4b[0]\n" + ".word 0x4f97e09a // sdot v26.4s, v4.16b, v23.4b[0]\n" + ".word 0x4f97e8b9 // sdot v25.4s, v5.16b, v23.4b[2]\n" + ".word 0x4f97e8da // sdot v26.4s, v6.16b, v23.4b[2]\n" + "sqrdmulh v25.4s, v25.4s, v21.4s\n" + "sqrdmulh v26.4s, v26.4s, v22.4s\n" + "sqrshl v25.4s, v25.4s, v19.4s\n" + "sqrshl v26.4s, v26.4s, v20.4s\n" + "sqxtn v27.4h, v25.4s\n" + "sqxtn2 v27.8h, v26.4s\n" + "sqadd v26.8h, v27.8h, v0.8h\n" + "sqxtn v26.8b, v26.8h\n" + "smax v26.8b, v26.8b, v1.8b\n" + "smin v26.8b, v26.8b, v2.8b\n" + "ushr v25.2d, v23.2d, #16\n" + "str d26, [x25]\n" + "ushr v26.2d, v24.2d, #16\n" + "add x25, x25, x17\n" + "cbz x27, " DC_KERNEL_MULT_STRIDE_15 "b\n" + // %bb.17: // in Loop: Header=BB108_16 Depth=2 + "mov v27.16b, v17.16b\n" + "mov v28.16b, v18.16b\n" + ".word 0x4f9ae0fb // sdot v27.4s, v7.16b, v26.4b[0]\n" + ".word 0x4f9ae21c // sdot v28.4s, v16.16b, v26.4b[0]\n" + ".word 0x4f99e07b // sdot v27.4s, v3.16b, v25.4b[0]\n" + ".word 0x4f99e09c // sdot v28.4s, v4.16b, v25.4b[0]\n" + ".word 0x4f99e8bb // sdot v27.4s, v5.16b, v25.4b[2]\n" + 
".word 0x4f99e8dc // sdot v28.4s, v6.16b, v25.4b[2]\n" + "ushr v25.2d, v23.2d, #32\n" + "sqrdmulh v23.4s, v27.4s, v21.4s\n" + "ushr v26.2d, v24.2d, #32\n" + "sqrdmulh v24.4s, v28.4s, v22.4s\n" + "sqrshl v23.4s, v23.4s, v19.4s\n" + "sqrshl v24.4s, v24.4s, v20.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v23.8h, v24.4s\n" + "sqadd v23.8h, v23.8h, v0.8h\n" + "sqxtn v23.8b, v23.8h\n" + "smax v23.8b, v23.8b, v1.8b\n" + "smin v23.8b, v23.8b, v2.8b\n" + "str d23, [x25]\n" + "add x25, x25, x17\n" + "b " DC_KERNEL_MULT_STRIDE_15 "b\n" + DC_KERNEL_MULT_STRIDE_18 ":\n" + + // Compiled intrinsics total stack 112, now 32 for spillage only. + "add sp, sp, #32\n" // =112 + : + // Outputs. + [ scratch_block_data ] "+r"(scratch_block_data), + [ filter_workspace ] "+r"(filter_workspace), + [ bias_data ] "+r"(bias_data), + [ output_block_data ] "+r"(output_block_data) + : + // Inputs. + [ function_params ] "r"(function_params) + : + // Clobbers. + "cc", "memory", + // We use these NEON registers. + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31", + // We use these general-purpose registers. + "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", + "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", + "x27", "x28"); + +#undef DC_KERNEL_MULT_STRIDE_1 +#undef DC_KERNEL_MULT_STRIDE_2 +#undef DC_KERNEL_MULT_STRIDE_3 +#undef DC_KERNEL_MULT_STRIDE_4 +#undef DC_KERNEL_MULT_STRIDE_5 +#undef DC_KERNEL_MULT_STRIDE_6 +#undef DC_KERNEL_MULT_STRIDE_7 +#undef DC_KERNEL_MULT_STRIDE_8 +#undef DC_KERNEL_MULT_STRIDE_9 +#undef DC_KERNEL_MULT_STRIDE_10 +#undef DC_KERNEL_MULT_STRIDE_11 +#undef DC_KERNEL_MULT_STRIDE_12 +#undef DC_KERNEL_MULT_STRIDE_13 +#undef DC_KERNEL_MULT_STRIDE_14 +#undef DC_KERNEL_MULT_STRIDE_15 +#undef DC_KERNEL_MULT_STRIDE_16 +#undef DC_KERNEL_MULT_STRIDE_17 +#undef DC_KERNEL_MULT_STRIDE_18 + } // NOLINT(readability/fn_size) Manually unrolled. + + static inline void Run(const int8* scratch_block_data, + const int8* filter_workspace, const int32* bias_data, + int8* output_block_data, + const DepthwiseConvDotProdParams* function_params) { + KernelMacroBlockNeon(scratch_block_data, filter_workspace, bias_data, + output_block_data, function_params); + } +}; + #undef DP_OFFSET_INPUT_DEPTH #undef DP_OFFSET_OUTPUT_DEPTH #undef DP_OFFSET_STRIDE From bee25f366baad85c3c35848c8e082be6ac79d71e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 18 Feb 2020 09:46:40 -0800 Subject: [PATCH 128/442] Tweak tolerance in brittle parallel for test. PiperOrigin-RevId: 295754745 Change-Id: I6e91c283c96625200aeb5bee258c69477ec44436 --- tensorflow/python/ops/parallel_for/test_util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/ops/parallel_for/test_util.py b/tensorflow/python/ops/parallel_for/test_util.py index 35d487f4318..c8eed9ca54e 100644 --- a/tensorflow/python/ops/parallel_for/test_util.py +++ b/tensorflow/python/ops/parallel_for/test_util.py @@ -38,13 +38,14 @@ class PForTestCase(test.TestCase): self.evaluate(init) return self.evaluate(targets1 + targets2) + # TODO(agarwal): Allow tests to pass down tolerances. 
def run_and_assert_equal(self, targets1, targets2): outputs = self._run_targets(targets1, targets2) outputs = nest.flatten(outputs) # flatten SparseTensorValues n = len(outputs) // 2 for i in range(n): if outputs[i + n].dtype != np.object: - self.assertAllClose(outputs[i + n], outputs[i], rtol=1e-4, atol=1e-5) + self.assertAllClose(outputs[i + n], outputs[i], rtol=1e-4, atol=1e-4) else: self.assertAllEqual(outputs[i + n], outputs[i]) From b3ec9ff6bf6bac8ecd2a350296098166ac943962 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 18 Feb 2020 09:50:05 -0800 Subject: [PATCH 129/442] [tf.data] Add metric to track time spent in IteratorResource::GetNext(). PiperOrigin-RevId: 295755458 Change-Id: I63a0f11794a5dbbc40a4e036d46af8a6a1ed2519 --- tensorflow/core/common_runtime/metrics.cc | 12 ++++++++++++ tensorflow/core/common_runtime/metrics.h | 3 +++ tensorflow/core/kernels/data/iterator_ops.cc | 3 +++ 3 files changed, 18 insertions(+) diff --git a/tensorflow/core/common_runtime/metrics.cc b/tensorflow/core/common_runtime/metrics.cc index efe0a58a26b..f05f9312b50 100644 --- a/tensorflow/core/common_runtime/metrics.cc +++ b/tensorflow/core/common_runtime/metrics.cc @@ -69,6 +69,12 @@ auto* tf_data_bytes_fetched_counter = monitoring::Counter<0>::New( "/tensorflow/data/bytes_fetched", "The number of bytes fetched from tf.data Dataset iterator."); +auto* tf_data_getnext_duration_counter = monitoring::Sampler<0>::New( + {"/tensorflow/data/getnext_duration", + "Microseconds spent fetching an element from tf.data Dataset iterator."}, + // Power of 2 with bucket count 14 (256G) + {monitoring::Buckets::Exponential(1, 4, 20)}); + auto* tf_data_elements_counter = monitoring::Counter<1>::New( "/tensorflow/data/elements", "tf.data elements", "name"); @@ -134,6 +140,12 @@ void RecordTFDataBytesFetched(int64 num_bytes) { tf_data_bytes_fetched_counter->GetCell()->IncrementBy(num_bytes); } +void RecordTFDataGetNextDuration(uint64 duration_us) { + static auto* tfdata_getnext_duration_cell = + tf_data_getnext_duration_counter->GetCell(); + tfdata_getnext_duration_cell->Add(duration_us); +} + void RecordTFDataElements(const string& name, int64 num_elements) { tf_data_elements_counter->GetCell(name)->IncrementBy(num_elements); } diff --git a/tensorflow/core/common_runtime/metrics.h b/tensorflow/core/common_runtime/metrics.h index b208ff2e3be..963a12e9865 100644 --- a/tensorflow/core/common_runtime/metrics.h +++ b/tensorflow/core/common_runtime/metrics.h @@ -35,6 +35,9 @@ void RecordTFDataBytesRead(const string& name, int64 num_bytes); // Records the number of bytes fetched from tf.data.Dataset iterator. void RecordTFDataBytesFetched(int64 num_bytes); +// Records the time spent in ItertatorResource::GetNext() in microseconds. +void RecordTFDataGetNextDuration(uint64 duration_us); + // Records the number of elements produced by a tf.data.Dataset. // // The `name` argument identifies the Dataset type (e.g. "Batch" or "Map"). 
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc index b74dcf55419..7a1f12b044a 100644 --- a/tensorflow/core/kernels/data/iterator_ops.cc +++ b/tensorflow/core/kernels/data/iterator_ops.cc @@ -91,8 +91,11 @@ Status IteratorResource::GetNext(OpKernelContext* ctx, [cm = params.cancellation_manager]() { cm->StartCancel(); }, &deregister_fn)); auto cleanup = gtl::MakeCleanup(std::move(deregister_fn)); + uint64 start_time_us = ctx->env()->NowMicros(); auto val = captured_state->iterator->GetNext( IteratorContext(std::move(params)), out_tensors, end_of_sequence); + metrics::RecordTFDataGetNextDuration(ctx->env()->NowMicros() - + start_time_us); metrics::RecordTFDataBytesFetched(GetTotalBytes(*out_tensors)); return val; } From aa5956dc18f65027bc28c8be132505cf9859d328 Mon Sep 17 00:00:00 2001 From: Alex Stark Date: Tue, 18 Feb 2020 09:51:49 -0800 Subject: [PATCH 130/442] Depthwise convolution 3x3 per-channel int8 for dot-product ARM (16). Invoke new dot-product ASM path in normal per-channel flow. PiperOrigin-RevId: 295755806 Change-Id: Ief16e2acd78d2bbb9c5ced91f7a0312681d833fe --- .../depthwiseconv_uint8_3x3_filter.h | 14 +++++ .../optimized/integer_ops/depthwise_conv.h | 53 ++++++++++++++++--- .../internal/optimized/legacy_optimized_ops.h | 6 ++- 3 files changed, 63 insertions(+), 10 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h index ff19d8282f3..3dc863dcccd 100644 --- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h +++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h @@ -13405,6 +13405,20 @@ inline void DepthwiseConvDotProduct3x3( thread_dim); } +template +inline void DepthwiseConvDotProduct3x3PerChannel( + const DepthwiseParams& params, const RuntimeShape& input_shape, + const int8* input_data, const RuntimeShape& filter_shape, + const int8* filter_data, const RuntimeShape& bias_shape, + const int32* bias_data, const RuntimeShape& output_shape, int8* output_data, + int thread_start, int thread_end, int thread_dim) { + DepthwiseConvDotProduct3x3Impl< + implementation, depthwise_conv::QuantizationType::kPerChannelInt8>( + params, input_shape, input_data, filter_shape, filter_data, bias_shape, + bias_data, output_shape, output_data, thread_start, thread_end, + thread_dim); +} + #undef vst1_lane_8x4 #undef vst1q_lane_8x4 #undef vld1q_lane_s8x8 diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h index fd51647c9cf..4745003b5ea 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h @@ -20,6 +20,7 @@ limitations under the License. 
#include "tensorflow/lite/kernels/cpu_backend_threadpool.h" #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h" #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h" +#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h" #include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_3x3_filter.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h" @@ -1789,7 +1790,8 @@ inline void DepthwiseConvWithRounding( const int8* input_data, const RuntimeShape& filter_shape, const int8* filter_data, const RuntimeShape& bias_shape, const int32* bias_data, const RuntimeShape& output_shape, int8* output_data, - int thread_start, int thread_end, int thread_dim) { + int thread_start, int thread_end, int thread_dim, + const CpuBackendContext& cpu_backend_context) { ruy::profiler::ScopeLabel label("DepthwiseConvInt8/8bit"); const int depth_multiplier = params.depth_multiplier; const int dilation_width_factor = params.dilation_width_factor; @@ -1807,6 +1809,36 @@ inline void DepthwiseConvWithRounding( // Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on // Jetson TX-2. This compiler does not support the offsetof() macro. #if defined(__aarch64__) && !defined(GOOGLE_L4T) +#if defined(__ANDROID__) && defined(__clang__) + ruy::Context* ruy_context = cpu_backend_context.ruy_context(); + const auto ruy_paths = ruy_context != nullptr + ? ruy_context->GetRuntimeEnabledPaths() + : ruy::Path::kNone; + const bool has_dot_product_instructions = + (ruy_paths & ruy::Path::kNeonDotprod) != ruy::Path::kNone; + + // Dispatch to dot-product 3x3 kernels when supported. + if (has_dot_product_instructions) { + using optimized_ops::depthwise_conv::DotProduct3x3KernelType; + DotProduct3x3KernelType kernel_type = + optimized_ops::depthwise_conv::CategorizeDotProductKernel< + optimized_ops::depthwise_conv::QuantizationType::kPerChannelInt8>( + input_shape, filter_shape, output_shape, params); + if (kernel_type != DotProduct3x3KernelType::kNone) { + ruy::profiler::ScopeLabel specialized_label( + "DepthwiseConvInt8/8bit/3x3XDotProduct"); + optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3PerChannel< + DepthwiseConvImplementation::kUseNeon3x3DotProduct>( + params, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, thread_start, + thread_end, thread_dim); + return; + } + } + +#endif + // Dispatch to non-dot-product 3x3 kernels when supported. 
+ const int stride_width = params.stride_width; const int stride_height = params.stride_height; const int pad_width = params.padding_values.width; @@ -1842,11 +1874,12 @@ inline void DepthwiseConvImpl( const int8* input_data, const RuntimeShape& filter_shape, const int8* filter_data, const RuntimeShape& bias_shape, const int32* bias_data, const RuntimeShape& output_shape, int8* output_data, - int thread_start, int thread_end, int thread_dim) { + int thread_start, int thread_end, int thread_dim, + const CpuBackendContext& cpu_backend_context) { return DepthwiseConvWithRounding( params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, output_shape, - output_data, thread_start, thread_end, thread_dim); + output_data, thread_start, thread_end, thread_dim, cpu_backend_context); } template @@ -1859,7 +1892,8 @@ struct DepthwiseConvWorkerTask : cpu_backend_threadpool::Task { const T* filter_data, const RuntimeShape& bias_shape, const TS* bias_data, const RuntimeShape& output_shape, T* output_data, int thread_start, int thread_end, - int thread_dim) + int thread_dim, + const CpuBackendContext& cpu_backend_context_x) : params_(params), output_multiplier_(output_multiplier), output_shift_(output_shift), @@ -1873,13 +1907,14 @@ struct DepthwiseConvWorkerTask : cpu_backend_threadpool::Task { output_data_(output_data), thread_start_(thread_start), thread_end_(thread_end), - thread_dim_(thread_dim) {} + thread_dim_(thread_dim), + cpu_backend_context(cpu_backend_context_x) {} void Run() override { DepthwiseConvImpl(params_, output_multiplier_, output_shift_, input_shape_, input_data_, filter_shape_, filter_data_, bias_shape_, bias_data_, output_shape_, output_data_, thread_start_, - thread_end_, thread_dim_); + thread_end_, thread_dim_, cpu_backend_context); } private: @@ -1897,6 +1932,7 @@ struct DepthwiseConvWorkerTask : cpu_backend_threadpool::Task { int thread_start_; int thread_end_; int thread_dim_; + const CpuBackendContext& cpu_backend_context; }; inline int HowManyConvThreads(const RuntimeShape& output_shape, @@ -1947,7 +1983,8 @@ inline void DepthwiseConvPerChannel( DepthwiseConvImpl(params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0, - /*thread_end=*/output_rows, /*thread_dim=*/1); + /*thread_end=*/output_rows, /*thread_dim=*/1, + *cpu_backend_context); } else { std::vector> tasks; // TODO(b/131746020) don't create new heap allocations every time. 
@@ -1960,7 +1997,7 @@ inline void DepthwiseConvPerChannel( tasks.emplace_back(params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, output_shape, output_data, thread_start, - thread_end, thread_dim); + thread_end, thread_dim, *cpu_backend_context); thread_start = thread_end; } cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), diff --git a/tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h index da612804253..325498b3f3f 100644 --- a/tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h +++ b/tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h @@ -512,10 +512,11 @@ struct LegacyPerChannelDepthwiseConvWorkerTask : public gemmlowp::Task { thread_dim_(thread_dim) {} void Run() override { + CpuBackendContext backend_context; optimized_integer_ops::DepthwiseConvImpl( params_, output_multiplier_, output_shift_, input_shape_, input_data_, filter_shape_, filter_data_, bias_shape_, bias_data_, output_shape_, - output_data_, thread_start_, thread_end_, thread_dim_); + output_data_, thread_start_, thread_end_, thread_dim_, backend_context); } private: @@ -568,11 +569,12 @@ inline void DepthwiseConvPerChannel( thread_count = std::max(1, std::min(thread_count, max_threads)); if (thread_count == 1) { + CpuBackendContext backend_context; optimized_integer_ops::DepthwiseConvImpl( params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, output_shape, output_data, /*thread_start=*/0, - /*thread_end=*/output_rows, /*thread_dim=*/1); + /*thread_end=*/output_rows, /*thread_dim=*/1, backend_context); } else { std::vector tasks(thread_count); int thread_start = 0; From e764a2f7f8f3ce472002c6822d3d7ac66783f0ea Mon Sep 17 00:00:00 2001 From: Prakalp Srivastava Date: Tue, 18 Feb 2020 09:54:57 -0800 Subject: [PATCH 131/442] Add import support for HLO Sort op. 
PiperOrigin-RevId: 295756502 Change-Id: I9574a93212d3bed7ba344ae407604e394c8599ac --- .../mlir/xla/hlo_function_importer.cc | 10 ++++++++++ .../mlir/xla/tests/translate/import.hlotxt | 19 +++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc index 545bcb4f44f..6081f2e1461 100644 --- a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc +++ b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc @@ -385,6 +385,16 @@ StatusOr HloFunctionImporter::ImportInstruction( ConvertDimensions(instruction->slice_strides())) .getOperation(); } + case HloOpcode::kSort: { + auto sort_instruction = static_cast(instruction); + auto sort_op = func_builder->create( + loc, result_type, operands, + builder_->getI64IntegerAttr(sort_instruction->sort_dimension()), + builder_->getBoolAttr(sort_instruction->is_stable())); + TF_RETURN_IF_ERROR(ImportComputation(sort_instruction->to_apply(), + &sort_op.comparator())); + return sort_op.getOperation(); + } case HloOpcode::kConditional: { llvm::SmallVector rets; TF_RETURN_IF_ERROR(GetMlirTypes( diff --git a/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt index b9f88ef699c..a02db66cd47 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt @@ -743,6 +743,25 @@ ENTRY %dummy_main (Arg_0.1: f32[]) -> f32[] { ROOT %sine.3 = f32[1,16,16,3]{3,2,1,0} sine(f32[1,16,16,3]{3,2,1,0} %arg0.1) } +// Test sort +%compare { + p.0.lhs = f32[] parameter(0) + p.0.rhs = f32[] parameter(1) + ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT +} + +%test_sort { + x = f32[1024]{0} parameter(0) + ROOT sorted = f32[1024]{0} sort(x), dimensions={0}, is_stable=true, to_apply=compare +} +// CHECK-LABEL: func @test_sort +// CHECK-SAME: [[ARG:%.*]]: tensor<1024xf32>) -> tensor<1024xf32> +// CHECK: "xla_hlo.sort"([[ARG]]) ( { +// CHECK: ^bb0([[ARG0:%.*]]: tensor, [[ARG1:%.*]]: tensor): +// CHECK: [[CMP:%.*]] = "xla_hlo.compare"([[ARG0]], [[ARG1]]) {comparison_direction = "LT", name = "lt"} : (tensor, tensor) -> tensor +// CHECK: "xla_hlo.return"([[CMP]]) : (tensor) -> () +// CHECK: }) {dimension = 0 : i64, is_stable = true} : (tensor<1024xf32>) -> tensor<1024xf32> + // CHECK-LABEL: func @test_subtract %test_subtract (Arg_0.1: f32[4], Arg_1.2: f32[4]) -> f32[4] { %Arg_0.1 = f32[4] parameter(0) From 1319d5a1544475b8fdebe92948e24fed46498da5 Mon Sep 17 00:00:00 2001 From: Jakob Buchgraber Date: Tue, 18 Feb 2020 10:07:14 -0800 Subject: [PATCH 132/442] python_configure: delete dead code PiperOrigin-RevId: 295759695 Change-Id: Id45711d7ba82b5b5d3862399c6bb32833beb4ad5 --- third_party/py/python_configure.bzl | 103 ---------------------------- 1 file changed, 103 deletions(-) diff --git a/third_party/py/python_configure.bzl b/third_party/py/python_configure.bzl index 2995564c1d1..bbeaa46f332 100644 --- a/third_party/py/python_configure.bzl +++ b/third_party/py/python_configure.bzl @@ -22,109 +22,6 @@ load( "read_dir", ) -def _which(repository_ctx, program_name): - """Returns the full path to a program on the execution platform.""" - if _is_windows(repository_ctx): - if not program_name.endswith(".exe"): - program_name = program_name + ".exe" - result = _execute(repository_ctx, ["where.exe", program_name]) - else: - result = _execute(repository_ctx, ["which", program_name]) - return result.stdout.rstrip() - -def 
_get_environ(repository_ctx, name, default_value = None): - """Returns the value of an environment variable on the execution platform.""" - if _is_windows(repository_ctx): - result = _execute( - repository_ctx, - ["cmd.exe", "/c", "echo", "%" + name + "%"], - empty_stdout_fine = True, - ) - else: - cmd = "echo -n \"$%s\"" % name - result = _execute( - repository_ctx, - [get_bash_bin(repository_ctx), "-c", cmd], - empty_stdout_fine = True, - ) - if len(result.stdout) == 0: - return default_value - return result.stdout - -def _get_host_environ(repository_ctx, name): - return repository_ctx.os.environ.get(name) - -def _fail(msg): - """Output failure message when auto configuration fails.""" - red = "\033[0;31m" - no_color = "\033[0m" - fail("%sPython Configuration Error:%s %s\n" % (red, no_color, msg)) - -def _is_windows(repository_ctx): - """Returns true if the execution platform is windows.""" - - os_name = "" - if hasattr(repository_ctx.attr, "exec_properties") and "OSFamily" in repository_ctx.attr.exec_properties: - os_name = repository_ctx.attr.exec_properties["OSFamily"] - else: - os_name = repository_ctx.os.name - - return os_name.lower().find("windows") != -1 - -def _execute( - repository_ctx, - cmdline, - error_msg = None, - error_details = None, - empty_stdout_fine = False): - """Executes an arbitrary shell command. - - Args: - repository_ctx: the repository_ctx object - cmdline: list of strings, the command to execute - error_msg: string, a summary of the error if the command fails - error_details: string, details about the error or steps to fix it - empty_stdout_fine: bool, if True, an empty stdout result is fine, otherwise - it's an error - Return: - the result of repository_ctx.execute(cmdline) - """ - result = repository_ctx.execute(cmdline) - if result.stderr or not (empty_stdout_fine or result.stdout): - _fail("\n".join([ - error_msg.strip() if error_msg else "Repository command failed", - result.stderr.strip(), - error_details if error_details else "", - ])) - return result - -def _read_dir(repository_ctx, src_dir): - """Returns a string with all files in a directory. - - Finds all files inside a directory, traversing subfolders and following - symlinks. The returned string contains the full path of all files - separated by line breaks. - """ - if _is_windows(repository_ctx): - src_dir = src_dir.replace("/", "\\") - find_result = _execute( - repository_ctx, - ["cmd.exe", "/c", "dir", src_dir, "/b", "/s", "/a-d"], - empty_stdout_fine = True, - ) - - # src_files will be used in genrule.outs where the paths must - # use forward slashes. - result = find_result.stdout.replace("\\", "/") - else: - find_result = _execute( - repository_ctx, - ["find", src_dir, "-follow", "-type", "f"], - empty_stdout_fine = True, - ) - result = find_result.stdout - return result - def _genrule(src_dir, genrule_name, command, outs): """Returns a string with a genrule. From 9211d97305bf6f782b12f8f35deca0b90d61d448 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Tue, 18 Feb 2020 10:13:26 -0800 Subject: [PATCH 133/442] Now that tensorflow_core is gone, point includes and sysconfig at tensorflow pkg. As of pypi nightly 20200215, the includes/ directory in the tensorflow{,_core} site-packages is missing/incomplete. This is due to the removal of the virtual tensorflow pointing to tensorflow_core package but without updating sysconfig.py or the seutp.py/MANIFEST.in. This CL fixes that. 
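For illustration only, a minimal sketch of the downstream flow this keeps working: resolving the
installed package's header and library directories through the public tf.sysconfig API when
building a custom op. The op source name and the exact compiler flags below are hypothetical
placeholders, not part of this change; the real flags come from get_compile_flags()/get_link_flags()
of the installed wheel.

    # Sketch: locate headers/libs from the installed tensorflow package.
    import subprocess
    import tensorflow as tf

    include_dir = tf.sysconfig.get_include()  # .../site-packages/tensorflow/include
    lib_dir = tf.sysconfig.get_lib()          # .../site-packages/tensorflow

    # Compile a hypothetical custom op "zero_out.cc" against those paths.
    subprocess.check_call(
        ["g++", "-std=c++11", "-shared", "zero_out.cc", "-o", "zero_out.so", "-fPIC"]
        + tf.sysconfig.get_compile_flags()
        + tf.sysconfig.get_link_flags())
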
PiperOrigin-RevId: 295761153 Change-Id: I51e21dbf40f4c9b54a98978cfa3e0b5fbcb4bc61 --- tensorflow/python/platform/sysconfig.py | 8 +++---- tensorflow/tools/pip_package/MANIFEST.in | 20 ++++++++-------- tensorflow/tools/pip_package/setup.py | 30 +++++++++++------------- 3 files changed, 28 insertions(+), 30 deletions(-) diff --git a/tensorflow/python/platform/sysconfig.py b/tensorflow/python/platform/sysconfig.py index 71ca2867fef..721ad99c60a 100644 --- a/tensorflow/python/platform/sysconfig.py +++ b/tensorflow/python/platform/sysconfig.py @@ -36,10 +36,10 @@ def get_include(): The directory as string. """ # Import inside the function. - # sysconfig is imported from the tensorflow_core module, so having this + # sysconfig is imported from the tensorflow module, so having this # import at the top would cause a circular import, resulting in - # the tensorflow_core module missing symbols that come after sysconfig. - import tensorflow_core as tf + # the tensorflow module missing symbols that come after sysconfig. + import tensorflow as tf return _os_path.join(_os_path.dirname(tf.__file__), 'include') @@ -50,7 +50,7 @@ def get_lib(): Returns: The directory as string. """ - import tensorflow_core as tf + import tensorflow as tf return _os_path.join(_os_path.dirname(tf.__file__)) diff --git a/tensorflow/tools/pip_package/MANIFEST.in b/tensorflow/tools/pip_package/MANIFEST.in index b83fcabfa93..41652b1311a 100644 --- a/tensorflow/tools/pip_package/MANIFEST.in +++ b/tensorflow/tools/pip_package/MANIFEST.in @@ -9,14 +9,14 @@ recursive-include * *.dylib recursive-include * *.dll recursive-include * *.lib recursive-include * *.csv -recursive-include tensorflow_core/include/tensorflow *.h -recursive-include tensorflow_core/include/tensorflow *.proto -recursive-include tensorflow_core/include/Eigen * -recursive-include tensorflow_core/include/absl * -recursive-include tensorflow_core/include/external * -recursive-include tensorflow_core/include/google *.h -recursive-include tensorflow_core/include/google *.inc -recursive-include tensorflow_core/include/include *.h -recursive-include tensorflow_core/include/third_party * -recursive-include tensorflow_core/include/unsupported * +recursive-include tensorflow/include/tensorflow *.h +recursive-include tensorflow/include/tensorflow *.proto +recursive-include tensorflow/include/Eigen * +recursive-include tensorflow/include/absl * +recursive-include tensorflow/include/external * +recursive-include tensorflow/include/google *.h +recursive-include tensorflow/include/google *.inc +recursive-include tensorflow/include/include *.h +recursive-include tensorflow/include/third_party * +recursive-include tensorflow/include/unsupported * diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index 6e0576102dc..55972e1d4ca 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -143,7 +143,7 @@ class InstallCommand(InstallCommandBase): def finalize_options(self): ret = InstallCommandBase.finalize_options(self) - self.install_headers = os.path.join(self.install_purelib, 'tensorflow_core', + self.install_headers = os.path.join(self.install_purelib, 'tensorflow', 'include') self.install_lib = self.install_platlib return ret @@ -181,17 +181,15 @@ class InstallHeaders(Command): # Get rid of some extra intervening directories so we can have fewer # directories for -I install_dir = re.sub('/google/protobuf_archive/src', '', install_dir) - install_dir = re.sub('/include/tensorflow_core/', 
'/include/tensorflow/', - install_dir) - # Copy external code headers into tensorflow_core/include. + # Copy external code headers into tensorflow/include. # A symlink would do, but the wheel file that gets created ignores # symlink within the directory hierarchy. # NOTE(keveman): Figure out how to customize bdist_wheel package so # we can do the symlink. external_header_locations = [ - 'tensorflow_core/include/external/eigen_archive/', - 'tensorflow_core/include/external/com_google_absl/', + 'tensorflow/include/external/eigen_archive/', + 'tensorflow/include/external/com_google_absl/', ] for location in external_header_locations: if location in install_dir: @@ -245,20 +243,20 @@ else: EXTENSION_NAME = 'python/_pywrap_tensorflow_internal.so' headers = ( - list(find_files('*.proto', 'tensorflow_core/compiler')) + - list(find_files('*.proto', 'tensorflow_core/core')) + - list(find_files('*.proto', 'tensorflow_core/python')) + - list(find_files('*.h', 'tensorflow_core/compiler')) + - list(find_files('*.h', 'tensorflow_core/core')) + - list(find_files('*.h', 'tensorflow_core/python')) + - list(find_files('*.h', 'tensorflow_core/stream_executor')) + + list(find_files('*.proto', 'tensorflow/compiler')) + + list(find_files('*.proto', 'tensorflow/core')) + + list(find_files('*.proto', 'tensorflow/python')) + + list(find_files('*.h', 'tensorflow/compiler')) + + list(find_files('*.h', 'tensorflow/core')) + + list(find_files('*.h', 'tensorflow/python')) + + list(find_files('*.h', 'tensorflow/stream_executor')) + list(find_files('*.h', 'google/com_google_protobuf/src')) + list(find_files('*.inc', 'google/com_google_protobuf/src')) + list(find_files('*', 'third_party/eigen3')) + list( - find_files('*.h', 'tensorflow_core/include/external/com_google_absl')) + + find_files('*.h', 'tensorflow/include/external/com_google_absl')) + list( - find_files('*.inc', 'tensorflow_core/include/external/com_google_absl')) - + list(find_files('*', 'tensorflow_core/include/external/eigen_archive'))) + find_files('*.inc', 'tensorflow/include/external/com_google_absl')) + + list(find_files('*', 'tensorflow/include/external/eigen_archive'))) setup( name=project_name, From 884a14ac9ad247e1cb020b66f37a62e49f0fa406 Mon Sep 17 00:00:00 2001 From: Jakob Buchgraber Date: Tue, 18 Feb 2020 10:29:11 -0800 Subject: [PATCH 134/442] cuda_configure: make find_libs() compatible with remote execution This change moves the logic of _find_cuda_lib() to check_cuda_libs.py. Instead of invoking _find_cuda_lib() once per library we now invoke check_cuda_libs.py once with a list of all libraries to look for as arguments. For Example: python check_cuda_libs.py /usr/local/cuda/lib64/libcudart.so.10.1 True /usr/local/cuda/lib64/libcudart_static.a False PiperOrigin-RevId: 295765176 Change-Id: I743770ff640d009272f62c4ed5a89044b5343972 --- third_party/gpus/check_cuda_libs.py | 89 ++++++++++++++++++ third_party/gpus/cuda_configure.bzl | 141 +++++++++++++--------------- 2 files changed, 153 insertions(+), 77 deletions(-) create mode 100644 third_party/gpus/check_cuda_libs.py diff --git a/third_party/gpus/check_cuda_libs.py b/third_party/gpus/check_cuda_libs.py new file mode 100644 index 00000000000..b7b36e6466e --- /dev/null +++ b/third_party/gpus/check_cuda_libs.py @@ -0,0 +1,89 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Verifies that a list of libraries is installed on the system. + +Takes a a list of arguments with every two subsequent arguments being a logical +tuple of (path, check_soname). The path to the library and either True or False +to indicate whether to check the soname field on the shared library. + +Example Usage: +./check_cuda_libs.py /path/to/lib1.so True /path/to/lib2.so False +""" +import os +import os.path +import platform +import subprocess +import sys + +# pylint: disable=g-import-not-at-top,g-importing-member +try: + from shutil import which +except ImportError: + from distutils.spawn import find_executable as which +# pylint: enable=g-import-not-at-top,g-importing-member + + +class ConfigError(Exception): + pass + + +def _is_windows(): + return platform.system() == "Windows" + + +def check_cuda_lib(path, check_soname=True): + """Tests if a library exists on disk and whether its soname matches the filename. + + Args: + path: the path to the library. + check_soname: whether to check the soname as well. + + Raises: + ConfigError: If the library does not exist or if its soname does not match + the filename. + """ + if not os.path.isfile(path): + raise ConfigError("No library found under: " + path) + objdump = which("objdump") + if check_soname and objdump is not None and not _is_windows(): + # Decode is necessary as in py3 the return type changed from str to bytes + output = subprocess.check_output([objdump, "-p", path]).decode("ascii") + output = [line for line in output.splitlines() if "SONAME" in line] + sonames = [line.strip().split(" ")[-1] for line in output] + if not any([soname == os.path.basename(path) for soname in sonames]): + raise ConfigError("None of the libraries match their SONAME: " + path) + + +def main(): + try: + args = [argv for argv in sys.argv[1:]] + if len(args) % 2 == 1: + raise ConfigError("Expected even number of arguments") + checked_paths = [] + for i in range(0, len(args), 2): + path = args[i] + check_cuda_lib(path, check_soname=args[i + 1] == "True") + checked_paths.append(path) + # pylint: disable=superfluous-parens + print(os.linesep.join(checked_paths)) + # pylint: enable=superfluous-parens + except ConfigError as e: + sys.stderr.write(str(e)) + sys.exit(1) + + +if __name__ == "__main__": + main() + diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index c15f3c08189..6fbe306457f 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -40,6 +40,7 @@ load( load( "//third_party/remote_config:common.bzl", "err_out", + "execute", "get_bash_bin", "get_cpu_value", "get_python_bin", @@ -447,67 +448,46 @@ def lib_name(base_name, cpu_value, version = None, static = False): else: auto_configure_fail("Invalid cpu_value: %s" % cpu_value) -def find_lib(repository_ctx, paths, check_soname = True): - """ - Finds a library among a list of potential paths. - - Args: - paths: List of paths to inspect. - - Returns: - Returns the first path in paths that exist. 
- """ - objdump = repository_ctx.which("objdump") - mismatches = [] - for path in [repository_ctx.path(path) for path in paths]: - if not path.exists: - continue - if check_soname and objdump != None and not is_windows(repository_ctx): - output = raw_exec(repository_ctx, [objdump, "-p", str(path)]).stdout - output = [line for line in output.splitlines() if "SONAME" in line] - sonames = [line.strip().split(" ")[-1] for line in output] - if not any([soname == path.basename for soname in sonames]): - mismatches.append(str(path)) - continue - return str(path) - if mismatches: - auto_configure_fail( - "None of the libraries match their SONAME: " + ", ".join(mismatches), - ) - auto_configure_fail("No library found under: " + ", ".join(paths)) - -def _find_cuda_lib( - lib, - repository_ctx, - cpu_value, - basedir, - version, - static = False): - """Finds the given CUDA or cuDNN library on the system. - - Args: - lib: The name of the library, such as "cudart" - repository_ctx: The repository context. - cpu_value: The name of the host operating system. - basedir: The install directory of CUDA or cuDNN. - version: The version of the library. - static: True if static library, False if shared object. - - Returns: - Returns the path to the library. - """ +def _lib_path(lib, cpu_value, basedir, version, static): file_name = lib_name(lib, cpu_value, version, static) - return find_lib( - repository_ctx, - ["%s/%s" % (basedir, file_name)], - check_soname = version and not static, + return "%s/%s" % (basedir, file_name) + +def _should_check_soname(version, static): + return version and not static + +def _check_cuda_lib_params(lib, cpu_value, basedir, version, static = False): + return ( + _lib_path(lib, cpu_value, basedir, version, static), + _should_check_soname(version, static), ) -def _find_libs(repository_ctx, cuda_config): +def _check_cuda_libs(repository_ctx, script_path, libs): + python_bin = get_python_bin(repository_ctx) + contents = repository_ctx.read(script_path).splitlines() + + cmd = "from os import linesep;" + cmd += "f = open('script.py', 'w');" + for line in contents: + cmd += "f.write('%s' + linesep);" % line + cmd += "f.close();" + cmd += "from os import system;" + args = " ".join([path + " " + str(check) for path, check in libs]) + cmd += "system('%s script.py %s');" % (python_bin, args) + + all_paths = [path for path, _ in libs] + checked_paths = execute(repository_ctx, [python_bin, "-c", cmd]).stdout.splitlines() + if all_paths != checked_paths: + auto_configure_fail("Error with installed CUDA libs. Expected '%s'. Actual '%s'." % (all_paths, checked_paths)) + +def _find_libs(repository_ctx, check_cuda_libs_script, cuda_config): """Returns the CUDA and cuDNN libraries on the system. + Also, verifies that the script actually exist. + Args: repository_ctx: The repository context. + check_cuda_libs_script: The path to a script verifying that the cuda + libraries exist on the system. 
cuda_config: The CUDA config as returned by _get_cuda_config Returns: @@ -515,80 +495,86 @@ def _find_libs(repository_ctx, cuda_config): """ cpu_value = cuda_config.cpu_value stub_dir = "" if is_windows(repository_ctx) else "/stubs" - return { - "cuda": _find_cuda_lib( + + check_cuda_libs_params = { + "cuda": _check_cuda_lib_params( "cuda", - repository_ctx, cpu_value, cuda_config.config["cuda_library_dir"] + stub_dir, - None, + version = None, + static = False, ), - "cudart": _find_cuda_lib( + "cudart": _check_cuda_lib_params( "cudart", - repository_ctx, cpu_value, cuda_config.config["cuda_library_dir"], cuda_config.cuda_version, + static = False, ), - "cudart_static": _find_cuda_lib( + "cudart_static": _check_cuda_lib_params( "cudart_static", - repository_ctx, cpu_value, cuda_config.config["cuda_library_dir"], cuda_config.cuda_version, static = True, ), - "cublas": _find_cuda_lib( + "cublas": _check_cuda_lib_params( "cublas", - repository_ctx, cpu_value, cuda_config.config["cublas_library_dir"], cuda_config.cuda_lib_version, + static = False, ), - "cusolver": _find_cuda_lib( + "cusolver": _check_cuda_lib_params( "cusolver", - repository_ctx, cpu_value, cuda_config.config["cuda_library_dir"], cuda_config.cuda_lib_version, + static = False, ), - "curand": _find_cuda_lib( + "curand": _check_cuda_lib_params( "curand", - repository_ctx, cpu_value, cuda_config.config["cuda_library_dir"], cuda_config.cuda_lib_version, + static = False, ), - "cufft": _find_cuda_lib( + "cufft": _check_cuda_lib_params( "cufft", - repository_ctx, cpu_value, cuda_config.config["cuda_library_dir"], cuda_config.cuda_lib_version, + static = False, ), - "cudnn": _find_cuda_lib( + "cudnn": _check_cuda_lib_params( "cudnn", - repository_ctx, cpu_value, cuda_config.config["cudnn_library_dir"], cuda_config.cudnn_version, + static = False, ), - "cupti": _find_cuda_lib( + "cupti": _check_cuda_lib_params( "cupti", - repository_ctx, cpu_value, cuda_config.config["cupti_library_dir"], cuda_config.cuda_version, + static = False, ), - "cusparse": _find_cuda_lib( + "cusparse": _check_cuda_lib_params( "cusparse", - repository_ctx, cpu_value, cuda_config.config["cuda_library_dir"], cuda_config.cuda_lib_version, + static = False, ), } + # Verify that the libs actually exist at their locations. + _check_cuda_libs(repository_ctx, check_cuda_libs_script, check_cuda_libs_params.values()) + + paths = {filename: v[0] for (filename, v) in check_cuda_libs_params.items()} + return paths + def _cudart_static_linkopt(cpu_value): """Returns additional platform-specific linkopts for cudart.""" return "" if cpu_value == "Darwin" else "\"-lrt\"," @@ -924,7 +910,8 @@ def _create_local_cuda_repository(repository_ctx): ], )) - cuda_libs = _find_libs(repository_ctx, cuda_config) + check_cuda_libs_script = repository_ctx.path(Label("@org_tensorflow//third_party/gpus:check_cuda_libs.py")) + cuda_libs = _find_libs(repository_ctx, check_cuda_libs_script, cuda_config) cuda_lib_srcs = [] cuda_lib_outs = [] for path in cuda_libs.values(): From b04371bc952f9f9668e862d82db71651fdef8dc6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 18 Feb 2020 10:37:24 -0800 Subject: [PATCH 135/442] Apply a sequence mask for the gradient in ctc_loss_dense. 
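For illustration, a minimal sketch of the masking being added, written with public TF ops (the
helper name is illustrative; ctc_ops.py itself uses the internal array_ops/dtypes modules):
gradient entries for frames beyond each example's logit_length are zeroed, so padded frames no
longer contribute to the gradient.

    import tensorflow as tf

    def mask_ctc_grad(grad, logit_length):
      # grad: [max_logit_length, batch_size, num_labels]; logit_length: [batch_size].
      max_logit_length = tf.shape(grad)[0]
      # Build a [batch, time] mask, move time to the front, and add a trailing
      # axis so it broadcasts over the label dimension.
      mask = tf.sequence_mask(logit_length, maxlen=max_logit_length, dtype=grad.dtype)
      mask = tf.expand_dims(tf.transpose(mask, perm=[1, 0]), axis=2)
      return grad * mask
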
PiperOrigin-RevId: 295767405
Change-Id: I80a53508288cdc505f876901fde5fa46a7645bca
---
 tensorflow/python/kernel_tests/ctc_loss_op_test.py | 9 +++++----
 tensorflow/python/ops/ctc_ops.py | 11 +++++++++++
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/kernel_tests/ctc_loss_op_test.py b/tensorflow/python/kernel_tests/ctc_loss_op_test.py
index 036cd8ed648..e7f1f8a5e85 100644
--- a/tensorflow/python/kernel_tests/ctc_loss_op_test.py
+++ b/tensorflow/python/kernel_tests/ctc_loss_op_test.py
@@ -367,7 +367,8 @@ class CTCLossTestV2(test.TestCase):
 batch_size = 8
 num_labels = 6
 label_length = 5
- num_frames = 12
+ minimum_logits_length = 10
+ num_frames = minimum_logits_length + batch_size
 logits = random_ops.random_uniform([num_frames, batch_size, num_labels])
 labels = random_ops.random_uniform(
 [batch_size, label_length], minval=1, maxval=num_labels,
@@ -379,7 +380,7 @@ class CTCLossTestV2(test.TestCase):
 label_lengths, maxlen=label_length, dtype=label_lengths.dtype)
 labels *= label_mask
- logit_lengths = [num_frames] * batch_size
+ logit_lengths = math_ops.range(batch_size) + minimum_logits_length
 ctc_loss = ctc_ops.ctc_loss_dense(
 labels=labels,
@@ -410,8 +411,8 @@ class CTCLossTestV2(test.TestCase):
 self.assertAllClose(*self.evaluate([ctc_loss, tf_nn_ctc_loss]))
 self.assertAllClose(
 *self.evaluate([ctc_loss_grads, tf_nn_ctc_grads]),
- rtol=2e-06,
- atol=2e-06)
+ rtol=4e-06,
+ atol=4e-06)
 @test_util.run_v1_only("b/120545219")
 def testCtcLossDenseUniqueFastPathIsSameAsCtcLoss(self):
diff --git a/tensorflow/python/ops/ctc_ops.py b/tensorflow/python/ops/ctc_ops.py
index d0298fd8b6d..4b3a5dd7fe9 100644
--- a/tensorflow/python/ops/ctc_ops.py
+++ b/tensorflow/python/ops/ctc_ops.py
@@ -658,6 +658,17 @@ def ctc_loss_and_grad(logits, labels, label_length, logit_length, unique=None):
 olabel_log_probs = _state_to_olabel(labels, num_labels, fwd_bwd_log_probs)
 grad = math_ops.exp(ilabel_log_probs) - math_ops.exp(olabel_log_probs)
+
+ # Applies the sequence mask to the gradient. It is enough to apply the mask
+ # only to ilabel_log_probs because olabel_log_probs already accounts for the
+ # mask. However, applying it to the whole gradient is safer and cleaner.
+ max_logit_length = _get_dim(logits, 0)
+ logit_mask = array_ops.sequence_mask(logit_length, max_logit_length,
+ dtypes.float32)
+ logit_mask = array_ops.transpose(logit_mask, perm=[1, 0])
+ logit_mask = array_ops.expand_dims(logit_mask, axis=2)
+ grad *= logit_mask
+
 loss = -log_likelihood
 return loss, grad

From dff4559ac3abca11bfad3400195b2f5a78420366 Mon Sep 17 00:00:00 2001
From: Jiri Simsa
Date: Tue, 18 Feb 2020 10:42:39 -0800
Subject: [PATCH 136/442] [tf.data] Internal cleanup.
PiperOrigin-RevId: 295768875 Change-Id: I77da989a9eb2c74706e64bdc5e863d13fa76832a --- .../core/kernels/data/cache_dataset_ops_test.cc | 8 ++++---- tensorflow/core/kernels/data/dataset_test_base.cc | 12 +++++++----- .../kernels/data/experimental/to_tf_record_op.cc | 4 ++-- tensorflow/core/kernels/data/iterator_ops.cc | 11 ++++++----- .../core/kernels/data/shuffle_dataset_op_test.cc | 4 ++-- .../core/kernels/data/window_dataset_op_test.cc | 3 ++- 6 files changed, 23 insertions(+), 19 deletions(-) diff --git a/tensorflow/core/kernels/data/cache_dataset_ops_test.cc b/tensorflow/core/kernels/data/cache_dataset_ops_test.cc index 9faf92b83da..c6bc70b4c94 100644 --- a/tensorflow/core/kernels/data/cache_dataset_ops_test.cc +++ b/tensorflow/core/kernels/data/cache_dataset_ops_test.cc @@ -182,8 +182,8 @@ TEST_P(ParameterizedGetNextTest, GetNext) { // Test the read mode. TF_ASSERT_OK(dataset_->MakeIterator( - iterator_ctx_.get(), test_case.dataset_params.iterator_prefix(), - &iterator_)); + iterator_ctx_.get(), /*parent=*/nullptr, + test_case.dataset_params.iterator_prefix(), &iterator_)); end_of_sequence = false; out_tensors.clear(); while (!end_of_sequence) { @@ -322,8 +322,8 @@ TEST_P(ParameterizedIteratorSaveAndRestoreTest, SaveAndRestore) { end_of_sequence = false; out_tensors.clear(); TF_ASSERT_OK(dataset_->MakeIterator( - iterator_ctx_.get(), test_case.dataset_params.iterator_prefix(), - &iterator_)); + iterator_ctx_.get(), /*parent=*/nullptr, + test_case.dataset_params.iterator_prefix(), &iterator_)); } std::unique_ptr serialization_ctx; diff --git a/tensorflow/core/kernels/data/dataset_test_base.cc b/tensorflow/core/kernels/data/dataset_test_base.cc index 38652753066..7c5d0c3f679 100644 --- a/tensorflow/core/kernels/data/dataset_test_base.cc +++ b/tensorflow/core/kernels/data/dataset_test_base.cc @@ -654,8 +654,8 @@ Status DatasetOpsTestBase::CheckIteratorSaveAndRestore( const string& iterator_prefix, const std::vector& expected_outputs, const std::vector& breakpoints, bool compare_order) { std::unique_ptr iterator; - TF_RETURN_IF_ERROR( - dataset_->MakeIterator(iterator_ctx_.get(), iterator_prefix, &iterator)); + TF_RETURN_IF_ERROR(dataset_->MakeIterator( + iterator_ctx_.get(), /*parent=*/nullptr, iterator_prefix, &iterator)); std::unique_ptr serialization_ctx; TF_RETURN_IF_ERROR(CreateSerializationContext(&serialization_ctx)); bool end_of_sequence = false; @@ -704,8 +704,9 @@ Status DatasetOpsTestBase::Initialize(const DatasetParams& dataset_params) { TF_RETURN_IF_ERROR(MakeDataset(dataset_params, &dataset_kernel_, ¶ms_, &dataset_ctx_, &tensors_, &dataset_)); TF_RETURN_IF_ERROR(CreateIteratorContext(dataset_ctx_.get(), &iterator_ctx_)); - TF_RETURN_IF_ERROR(dataset_->MakeIterator( - iterator_ctx_.get(), dataset_params.iterator_prefix(), &iterator_)); + TF_RETURN_IF_ERROR( + dataset_->MakeIterator(iterator_ctx_.get(), /*parent=*/nullptr, + dataset_params.iterator_prefix(), &iterator_)); initialized_ = true; return Status::OK(); } @@ -791,7 +792,8 @@ Status DatasetOpsTestBase::MakeIterator( CreateIteratorContext(dataset.op_kernel_context(), &iterator_ctx)); std::unique_ptr iterator_base; TF_RETURN_IF_ERROR(dataset.dataset()->MakeIterator( - iterator_ctx.get(), dataset_params.iterator_prefix(), &iterator_base)); + iterator_ctx.get(), /*parent=*/nullptr, dataset_params.iterator_prefix(), + &iterator_base)); *iterator = std::make_unique(std::move(iterator_ctx), std::move(iterator_base)); return Status::OK(); diff --git a/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc 
b/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc index 1f7576cbc75..6a910145b53 100644 --- a/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc +++ b/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc @@ -84,8 +84,8 @@ class ToTFRecordOp : public AsyncOpKernel { IteratorContext iter_ctx(std::move(params)); std::unique_ptr iterator; - TF_RETURN_IF_ERROR( - dataset->MakeIterator(&iter_ctx, "ToTFRecordOpIterator", &iterator)); + TF_RETURN_IF_ERROR(dataset->MakeIterator( + &iter_ctx, /*parent=*/nullptr, "ToTFRecordOpIterator", &iterator)); std::vector components; components.reserve(dataset->output_dtypes().size()); diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc index 7a1f12b044a..4adf7f64fba 100644 --- a/tensorflow/core/kernels/data/iterator_ops.cc +++ b/tensorflow/core/kernels/data/iterator_ops.cc @@ -191,7 +191,8 @@ Status IteratorResource::SetIteratorFromDataset(OpKernelContext* ctx, { auto cleanup = gtl::MakeCleanup(std::move(deregister_fn)); TF_RETURN_IF_ERROR(dataset->MakeIterator(IteratorContext(std::move(params)), - "Iterator", &iterator)); + /*parent=*/nullptr, "Iterator", + &iterator)); TF_RETURN_IF_ERROR( VerifyTypesMatch(output_dtypes_, iterator->output_dtypes())); TF_RETURN_IF_ERROR( @@ -565,8 +566,8 @@ class ToSingleElementOp : public HybridAsyncOpKernel { IteratorContext iter_ctx(std::move(params)); std::unique_ptr iterator; - TF_RETURN_IF_ERROR( - dataset->MakeIterator(&iter_ctx, "SingleElementIterator", &iterator)); + TF_RETURN_IF_ERROR(dataset->MakeIterator( + &iter_ctx, /*parent=*/nullptr, "SingleElementIterator", &iterator)); std::vector components; components.reserve(dataset->output_dtypes().size()); @@ -636,8 +637,8 @@ class ReduceDatasetOp : public HybridAsyncOpKernel { captured_func->Instantiate(&iter_ctx, &instantiated_captured_func)); std::unique_ptr iterator; - TF_RETURN_IF_ERROR( - dataset->MakeIterator(&iter_ctx, "ReduceIterator", &iterator)); + TF_RETURN_IF_ERROR(dataset->MakeIterator(&iter_ctx, /*parent=*/nullptr, + "ReduceIterator", &iterator)); // Iterate through the input dataset. while (true) { diff --git a/tensorflow/core/kernels/data/shuffle_dataset_op_test.cc b/tensorflow/core/kernels/data/shuffle_dataset_op_test.cc index 20fb2912f5b..ca9afce7fc1 100644 --- a/tensorflow/core/kernels/data/shuffle_dataset_op_test.cc +++ b/tensorflow/core/kernels/data/shuffle_dataset_op_test.cc @@ -344,8 +344,8 @@ TEST_P(ParameterizedGetNextTest, GetNext) { // Reshuffle the dataset. 
end_of_sequence = false;
 TF_ASSERT_OK(dataset_->MakeIterator(
- iterator_ctx_.get(), test_case.dataset_params.iterator_prefix(),
- &iterator_));
+ iterator_ctx_.get(), /*parent=*/nullptr,
+ test_case.dataset_params.iterator_prefix(), &iterator_));
 std::vector reshuffled_out_tensors;
 while (!end_of_sequence) {
 std::vector next;
diff --git a/tensorflow/core/kernels/data/window_dataset_op_test.cc b/tensorflow/core/kernels/data/window_dataset_op_test.cc
index bef42f761ac..31839e5d88d 100644
--- a/tensorflow/core/kernels/data/window_dataset_op_test.cc
+++ b/tensorflow/core/kernels/data/window_dataset_op_test.cc
@@ -302,7 +302,8 @@ TEST_P(ParameterizedGetNextTest, GetNext) {
 &window_dataset));
 std::unique_ptr window_dataset_iterator;
 TF_ASSERT_OK(window_dataset->MakeIterator(
- iterator_ctx_.get(), test_case.dataset_params.iterator_prefix(),
+ iterator_ctx_.get(), /*parent=*/nullptr,
+ test_case.dataset_params.iterator_prefix(),
 &window_dataset_iterator));
 bool end_of_window_dataset = false;
 std::vector window_elements;
From c6b1ac0bacdf1235408f7df9c81ed89dfc032359 Mon Sep 17 00:00:00 2001
From: Jiri Simsa
Date: Tue, 18 Feb 2020 10:44:29 -0800
Subject: [PATCH 137/442] [tf.data] Adding documentation and deprecations.

PiperOrigin-RevId: 295769370
Change-Id: Ibaaff550d1f2c91ec902ef77062b3fa70483f73e
---
 tensorflow/core/framework/dataset.h | 6 +++++-
 tensorflow/core/kernels/data/captured_function.h | 7 +++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index a02960eec29..141e075b454 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -751,6 +751,7 @@ class DatasetBase : public core::RefCounted {
 // TODO(jsimsa): Remove this overload once all callers are migrated to the API
 // that passes in the parent iterator pointer.
+ ABSL_DEPRECATED("Use the overload that passes the parent iterator pointer.")
 Status MakeIterator(IteratorContext* ctx, const string& output_prefix,
 std::unique_ptr* iterator) const {
 return MakeIterator(ctx, /*parent=*/nullptr, output_prefix, iterator);
@@ -758,6 +759,7 @@ class DatasetBase : public core::RefCounted {
 // TODO(jsimsa): Remove this overload once all callers are migrated to the API
 // that passes in the parent iterator pointer.
+ ABSL_DEPRECATED("Use the overload that passes the parent iterator pointer.")
 Status MakeIterator(IteratorContext&& ctx, const string& output_prefix,
 std::unique_ptr* iterator) const {
 return MakeIterator(&ctx, output_prefix, iterator);
@@ -769,7 +771,8 @@ class DatasetBase : public core::RefCounted {
 IteratorStateReader* reader,
 std::unique_ptr* iterator) const {
 std::unique_ptr it;
- TF_RETURN_IF_ERROR(MakeIterator(ctx, output_prefix, &it));
+ TF_RETURN_IF_ERROR(
+ MakeIterator(ctx, /*parent=*/nullptr, output_prefix, &it));
 TF_RETURN_IF_ERROR(it->Restore(ctx, reader));
 *iterator = std::move(it);
 return Status::OK();
@@ -809,6 +812,7 @@ class DatasetBase : public core::RefCounted {
 //
 // TODO(jsimsa): Remove this method once all `DatasetBase` implementations are
 // migrated over to `CheckExternalState`.
+ ABSL_DEPRECATED("Use CheckExternalState instead.")
 virtual bool IsStateful() const { return false; }
 // Indicates whether the dataset depends on any external state.
If so, the diff --git a/tensorflow/core/kernels/data/captured_function.h b/tensorflow/core/kernels/data/captured_function.h index a9d8343e023..1cb39644ed3 100644 --- a/tensorflow/core/kernels/data/captured_function.h +++ b/tensorflow/core/kernels/data/captured_function.h @@ -39,19 +39,26 @@ namespace data { class CapturedFunction; class InstantiatedCapturedFunction; +// Creates an iterator for a dataset which is created by applying the given +// function to the given input element. Status MakeIteratorFromInputElement( IteratorContext* ctx, const IteratorBase* parent, const std::vector& input_element, int64 thread_index, const InstantiatedCapturedFunction& inst_captured_func, StringPiece prefix, std::unique_ptr* out_iterator); +// Creates an iterator for a dataset which is created by applying the given +// function to the given input element. +// // TODO(jsimsa): Remove this overload once all callers are migrated to the API // that passes in the parent iterator pointer. +ABSL_DEPRECATED("Use the overload that passes the parent iterator pointer.") Status MakeIteratorFromInputElement( IteratorContext* ctx, const std::vector& input_element, int64 thread_index, const InstantiatedCapturedFunction& inst_captured_func, StringPiece prefix, std::unique_ptr* out_iterator); +// Determines whether the given node is stateful. Status IsNodeStateful(const FunctionLibraryDefinition& library, const NodeDef& node); From bd395324d8edc35a2a1fafe6cf65cbd36950a897 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 18 Feb 2020 10:46:12 -0800 Subject: [PATCH 138/442] Internal visibility whitelist change. PiperOrigin-RevId: 295769793 Change-Id: I7d1f10e11d98b33f6f50f4fd9e428f83968b6dc6 --- tensorflow/core/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index a6c1b80ff54..b89068c7a83 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -160,6 +160,7 @@ package_group( "//learning/freud/topic_models/tensorflow/...", "//perftools/accelerators/xprof/api/...", "//quality/webanswers/brain/tokenization/custom_tf_ops/kernels/...", + "//smartass/brain/server/...", ], ) From a66d4828f39268ebb178cf579a36dc8e8b0f967d Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Tue, 18 Feb 2020 10:58:27 -0800 Subject: [PATCH 139/442] [XLA] Run copy elision pass in a fixed point Copies might not be elided due to lifetime collisions with other copies which are yet to be removed. Running copy elision in a fixed point loop lets us elide those copies as well. Fixes #35874 PiperOrigin-RevId: 295773148 Change-Id: I2d70efa775dcb42c21ceb0d5078838dec2d60f06 --- .../compiler/xla/service/copy_insertion.cc | 72 +++++++++++++------ .../xla/service/copy_insertion_test.cc | 64 +++++++++++++++++ 2 files changed, 114 insertions(+), 22 deletions(-) diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc index 1f6107d6f36..c07c3eb3c3b 100644 --- a/tensorflow/compiler/xla/service/copy_insertion.cc +++ b/tensorflow/compiler/xla/service/copy_insertion.cc @@ -1043,15 +1043,31 @@ Status CopyInsertion::AddSpecialCaseCopies(const CallGraph& call_graph, HloInstruction* root = computation->root_instruction(); // Mark nondistinct/ambiguous indices. 
- absl::flat_hash_set seen; + absl::flat_hash_map seen; ShapeUtil::ForEachSubshape( root->shape(), [&](const Shape& /*subshape*/, const ShapeIndex& index) { std::vector buffers_at_index = alias_analysis->ComputeBuffersAt(root, index); bool buffer_seen_before = false; for (const HloBuffer* buffer : buffers_at_index) { - buffer_seen_before |= !seen.insert(buffer).second; + buffer_seen_before |= !seen.emplace(buffer, index).second; } + + if (buffer_seen_before && policy.copy_root_replicated_buffers && + computation == module->entry_computation() && + module->input_output_alias_config().OutputHasAlias(index) && + buffers_at_index.size() == 1) { + absl::optional alias = + module->input_output_alias_config().GetAliasedParameter(index); + CHECK(alias) << "Alias does not exist"; + const ShapeIndex& other_index = seen[buffers_at_index[0]]; + VLOG(2) << "Output indices " << index.ToString() << " and " + << other_index.ToString() << " are both aliased to " + << alias->parameter_number << " copying " << other_index; + add_index_to_copy(root, other_index); + return; + } + if (buffers_at_index.size() > 1 || (buffer_seen_before && policy.copy_root_replicated_buffers)) { VLOG(2) << "Index " << index << " of computation " @@ -1097,6 +1113,18 @@ Status CopyInsertion::AddSpecialCaseCopies(const CallGraph& call_graph, return Status::OK(); } +static int64 GetNumExistingCopies(const HloModule* module) { + int64 num_existing_copies = 0; + for (HloComputation* computation : module->computations()) { + for (HloInstruction* instruction : computation->instructions()) { + if (instruction->opcode() == HloOpcode::kCopy) { + ++num_existing_copies; + } + } + } + return num_existing_copies; +} + Status CopyInsertion::RemoveUnnecessaryCopies(const HloOrdering& ordering, HloModule* module) { TF_ASSIGN_OR_RETURN(std::unique_ptr alias_analysis, @@ -1112,13 +1140,24 @@ Status CopyInsertion::RemoveUnnecessaryCopies(const HloOrdering& ordering, } std::unique_ptr call_graph = CallGraph::Build(module); - for (HloComputation* computation : module->computations()) { - for (HloInstruction* instruction : computation->instructions()) { - if (instruction->opcode() == HloOpcode::kCopy && - copy_remover.TryElideCopy(instruction)) { - TF_RETURN_IF_ERROR(StripControlDependenciesFrom(instruction)); - TF_RETURN_IF_ERROR( - instruction->ReplaceAllUsesWith(instruction->mutable_operand(0))); + + int64 num_existing_copies = GetNumExistingCopies(module); + bool changed = true; + int64 num_iterations = -1; + while (changed) { + CHECK_LE(++num_iterations, num_existing_copies); + changed = false; + VLOG(2) << "Running fixpoint iteration " << num_iterations + << " of copy elision"; + for (HloComputation* computation : module->computations()) { + for (HloInstruction* instruction : computation->instructions()) { + if (instruction->opcode() == HloOpcode::kCopy && + copy_remover.TryElideCopy(instruction)) { + changed = true; + TF_RETURN_IF_ERROR(StripControlDependenciesFrom(instruction)); + TF_RETURN_IF_ERROR( + instruction->ReplaceAllUsesWith(instruction->mutable_operand(0))); + } } } } @@ -1156,17 +1195,6 @@ StatusOr CopyInsertion::Run(HloModule* module) { "Call graph must be flattened before copy insertion."); } - int64 num_existing_copies = 0; - if (VLOG_IS_ON(1)) { - for (HloComputation* computation : module->computations()) { - for (HloInstruction* instruction : computation->instructions()) { - if (instruction->opcode() == HloOpcode::kCopy) { - ++num_existing_copies; - } - } - } - } - TF_RETURN_IF_ERROR(AddCopiesToResolveInterference(module)); // 
Simplify the tuple structures introduced by the deep copies. This should be @@ -1185,7 +1213,6 @@ StatusOr CopyInsertion::Run(HloModule* module) { RemoveUnnecessaryCopies(DependencyHloOrdering(module), module)); DumpHloModuleDuringPassIfEnabled(name(), "after removing unnecessary copies", *module); - TF_RETURN_IF_ERROR(AddSpecialCaseCopies(*call_graph, module)); DumpHloModuleDuringPassIfEnabled(name(), "after adding special-case copies", *module); @@ -1202,7 +1229,8 @@ StatusOr CopyInsertion::Run(HloModule* module) { } } } - VLOG(1) << "Num copies before copy-insertion: " << num_existing_copies; + VLOG(1) << "Num copies before copy-insertion: " + << GetNumExistingCopies(module); VLOG(1) << "Num copies after copy-insertion: " << num_total_copies; } diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc index 8587c79ffb1..d58ee0ef20b 100644 --- a/tensorflow/compiler/xla/service/copy_insertion_test.cc +++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc @@ -2274,5 +2274,69 @@ ENTRY TestComputation { op::While(op::Copy(op::Parameter()))); } +TEST_F(CopyInsertionTest, FixpointComputationRequired) { + const string& hlo_string = R"( +HloModule Module + +fused_computation { + param0 = f32[3,3,96,1] parameter(0) + param1 = f32[] parameter(1) + broadcast = f32[3,3,96,1] broadcast(f32[] param1), dimensions={} + ROOT %add.0 = f32[3,3,96,1] add(f32[3,3,96,1] param0, f32[3,3,96,1] broadcast) +} + +ENTRY entry_computation { + arg0 = f32[3,3,96,1] parameter(0) + arg1 = f32[] parameter(1) + fusion = f32[3,3,96,1] fusion(f32[3,3,96,1] arg0, f32[] arg1), + kind=kLoop, calls=fused_computation + negate = f32[] negate(f32[] arg1) + ROOT tuple = (f32[3,3,96,1], f32[3,3,96,1], f32[], f32[]) tuple( + f32[3,3,96,1] fusion, + f32[3,3,96,1] arg0, + f32[] negate, + f32[] arg1) +} + )"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string)); + // Set up the aliasing manually which normally would be set by + // alias_passthrough_params pass. + ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias( + /*output_index=*/{1}, + /*param_number=*/0, + /*param_index=*/{}, HloInputOutputAliasConfig::AliasKind::kUserAlias)); + ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias( + /*output_index=*/{3}, + /*param_number=*/1, + /*param_index=*/{}, HloInputOutputAliasConfig::AliasKind::kUserAlias)); + + InsertCopies(module.get()); + + // There should be no copies inserted. 
+ EXPECT_EQ(CountCopies(*module), 0); +} + +TEST_F(CopyInsertionTest, NoAliasCheckViolation) { + const string& hlo_string = R"( +HloModule cluster + +ENTRY Entry { + %arg = f32[8,28,28,1] parameter(0) + %bitcast.2 = f32[8,1,28,28] bitcast(f32[8,28,28,1] %arg) + ROOT %tuple.1 = (f32[8,1,28,28], f32[8,28,28,1]) tuple(f32[8,1,28,28] %bitcast.2, f32[8,28,28,1] %arg) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string)); + ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias( + /*output_index=*/{1}, + /*param_number=*/0, + /*param_index=*/{}, HloInputOutputAliasConfig::AliasKind::kUserAlias)); + InsertCopies(module.get()); + EXPECT_EQ(CountCopies(*module), 1); +} + } // namespace } // namespace xla From eb02f932c8f95b452456b9d5ac98df69dcfd84ea Mon Sep 17 00:00:00 2001 From: Niranjan Hasabnis Date: Tue, 18 Feb 2020 11:56:46 -0800 Subject: [PATCH 140/442] Addressing review comments + adding missing comments for #endif --- tensorflow/core/kernels/mkl_matmul_op_fused.cc | 4 ++-- tensorflow/core/kernels/mkl_matmul_ops_common.h | 10 +++++----- tensorflow/core/kernels/mkl_qmatmul_op.cc | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/kernels/mkl_matmul_op_fused.cc b/tensorflow/core/kernels/mkl_matmul_op_fused.cc index 20d5ce3a1ec..213ace98681 100644 --- a/tensorflow/core/kernels/mkl_matmul_op_fused.cc +++ b/tensorflow/core/kernels/mkl_matmul_op_fused.cc @@ -151,7 +151,7 @@ class MklFusedMatMulOp : public MklDnnMatMulOpBase { if (input_md != matmul_pd->src_desc()) { #else if (input_md.data.format != MKL_TENSOR_FORMAT_NC) { -#endif +#endif // ENABLE_MKLDNN_V1 src_mkl.SetUsrMem(input_md, src_data); src_mkl.CheckReorderToOpMem(MEMORY_PD_WITHOUT_DATA( matmul_pd.get()->PRIMITIVE_DESC_SRC, this->cpu_engine_)); @@ -165,7 +165,7 @@ class MklFusedMatMulOp : public MklDnnMatMulOpBase { if (input_md != matmul_pd->weight_desc()) { #else if (input_md.data.format != weight_format) { -#endif +#endif // ENABLE_MKLDNN_V1 weight_mkl.SetUsrMem(input_md, weight_data); weight_mkl.CheckReorderToOpMem(MEMORY_PD_WITHOUT_DATA( matmul_pd.get()->PRIMITIVE_DESC_WEIGHTS, this->cpu_engine_)); diff --git a/tensorflow/core/kernels/mkl_matmul_ops_common.h b/tensorflow/core/kernels/mkl_matmul_ops_common.h index 3147921b8d3..10ba39ed005 100644 --- a/tensorflow/core/kernels/mkl_matmul_ops_common.h +++ b/tensorflow/core/kernels/mkl_matmul_ops_common.h @@ -115,7 +115,7 @@ class MklDnnMatMulFwdPrimitive : public MklPrimitive { // of the memory layout. Hence, these functions are disabled for v1.x. 
memory::format GetSrcMemoryFormat() const { return context_.src_fmt; } memory::format GetWeightMemoryFormat() const { return context_.weight_fmt; } -#endif // ENABLE_MKLDNN_V1 +#endif // !ENABLE_MKLDNN_V1 std::shared_ptr GetPrimitiveDesc() const { @@ -190,7 +190,7 @@ class MklDnnMatMulFwdPrimitive : public MklPrimitive { MEMORY_FORMAT::any)); #else matmul_fwd_params.weight_fmt)); -#endif +#endif // ENABLE_MKLDNN_V1 context_.dst_md.reset(new memory::desc({matmul_fwd_params.dst_dims}, MklDnnType(), @@ -260,7 +260,7 @@ class MklDnnMatMulFwdPrimitive : public MklPrimitive { context_.weight_fmt = static_cast( context_.fwd_pd.get()->weights_primitive_desc().desc().data.format); -#endif +#endif // !ENABLE_MKLDNN_V1 // Create memory primitive based on dummy data context_.src_mem.reset(new MEMORY_CONSTRUCTOR( @@ -285,7 +285,7 @@ class MklDnnMatMulFwdPrimitive : public MklPrimitive { context_.matmul_fwd.reset(new inner_product_forward( *context_.fwd_pd, *context_.src_mem, *context_.weight_mem, *context_.bias_mem, *context_.dst_mem)); -#endif +#endif // ENABLE_MKLDNN_V1 context_.fwd_primitives.push_back(*context_.matmul_fwd); return; @@ -538,7 +538,7 @@ void dnnl_gemm(char transa, char transb, int64_t m, int64_t n, int64_t k, dnnl_gemm_exec(a_md, b_md, c_md, static_cast(a), static_cast(b), static_cast(c), attr); } -#endif // ENABLE_MKLDNN_V1 +#endif // ENABLE_MKLDNN_V1_2 } // namespace tensorflow diff --git a/tensorflow/core/kernels/mkl_qmatmul_op.cc b/tensorflow/core/kernels/mkl_qmatmul_op.cc index 743bf641298..01c0892a8cb 100644 --- a/tensorflow/core/kernels/mkl_qmatmul_op.cc +++ b/tensorflow/core/kernels/mkl_qmatmul_op.cc @@ -273,7 +273,7 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase { #else weight_data = GetCachedWeight( context, static_cast(matmul_fwd->GetWeightMemoryFormat())); -#endif +#endif // ENABLE_MKLDNN_V1 is_weight_cached = (weight_data != nullptr); } @@ -466,7 +466,7 @@ class MklDnnQuantizedMatMulOp : public MklDnnMatMulOpBase { net.push_back( mkldnn::reorder(reorder_desc, *input_bias_, *scaled_bias_)); reorder_stream.submit(net).wait(); -#endif +#endif // ENABLE_MKLDNN_V1 return reinterpret_cast(scaled_bias_->get_data_handle()); } else { From d9c9c92c7c47401f2ac6862ba9a7d2cfd65775a0 Mon Sep 17 00:00:00 2001 From: Meghna Natraj Date: Tue, 18 Feb 2020 11:11:28 -0800 Subject: [PATCH 141/442] Fix bug in dropout. PiperOrigin-RevId: 295776879 Change-Id: Ic25abd0fe0e442f37a32c7f68307e43728658b71 --- tensorflow/examples/speech_commands/models.py | 58 +++++++++---------- .../examples/speech_commands/models_test.py | 26 ++++----- tensorflow/examples/speech_commands/train.py | 8 +-- 3 files changed, 45 insertions(+), 47 deletions(-) diff --git a/tensorflow/examples/speech_commands/models.py b/tensorflow/examples/speech_commands/models.py index 1b9dff9136b..c35d1b662f8 100644 --- a/tensorflow/examples/speech_commands/models.py +++ b/tensorflow/examples/speech_commands/models.py @@ -187,7 +187,7 @@ def create_single_fc_model(fingerprint_input, model_settings, is_training): placeholder. 
""" if is_training: - dropout_prob = tf.compat.v1.placeholder(tf.float32, name='dropout_prob') + dropout_rate = tf.compat.v1.placeholder(tf.float32, name='dropout_rate') fingerprint_size = model_settings['fingerprint_size'] label_count = model_settings['label_count'] weights = tf.compat.v1.get_variable( @@ -199,7 +199,7 @@ def create_single_fc_model(fingerprint_input, model_settings, is_training): shape=[label_count]) logits = tf.matmul(fingerprint_input, weights) + bias if is_training: - return logits, dropout_prob + return logits, dropout_rate else: return logits @@ -253,7 +253,7 @@ def create_conv_model(fingerprint_input, model_settings, is_training): placeholder. """ if is_training: - dropout_prob = tf.compat.v1.placeholder(tf.float32, name='dropout_prob') + dropout_rate = tf.compat.v1.placeholder(tf.float32, name='dropout_rate') input_frequency_size = model_settings['fingerprint_width'] input_time_size = model_settings['spectrogram_length'] fingerprint_4d = tf.reshape(fingerprint_input, @@ -276,7 +276,7 @@ def create_conv_model(fingerprint_input, model_settings, is_training): padding='SAME') + first_bias first_relu = tf.nn.relu(first_conv) if is_training: - first_dropout = tf.nn.dropout(first_relu, 1 - (dropout_prob)) + first_dropout = tf.nn.dropout(first_relu, rate=dropout_rate) else: first_dropout = first_relu max_pool = tf.nn.max_pool2d(input=first_dropout, @@ -303,7 +303,7 @@ def create_conv_model(fingerprint_input, model_settings, is_training): padding='SAME') + second_bias second_relu = tf.nn.relu(second_conv) if is_training: - second_dropout = tf.compat.v1.nn.dropout(second_relu, dropout_prob) + second_dropout = tf.nn.dropout(second_relu, rate=dropout_rate) else: second_dropout = second_relu second_conv_shape = second_dropout.get_shape() @@ -325,7 +325,7 @@ def create_conv_model(fingerprint_input, model_settings, is_training): shape=[label_count]) final_fc = tf.matmul(flattened_second_conv, final_fc_weights) + final_fc_bias if is_training: - return final_fc, dropout_prob + return final_fc, dropout_rate else: return final_fc @@ -377,7 +377,7 @@ def create_low_latency_conv_model(fingerprint_input, model_settings, placeholder. 
""" if is_training: - dropout_prob = tf.compat.v1.placeholder(tf.float32, name='dropout_prob') + dropout_rate = tf.compat.v1.placeholder(tf.float32, name='dropout_rate') input_frequency_size = model_settings['fingerprint_width'] input_time_size = model_settings['spectrogram_length'] fingerprint_4d = tf.reshape(fingerprint_input, @@ -402,7 +402,7 @@ def create_low_latency_conv_model(fingerprint_input, model_settings, padding='VALID') + first_bias first_relu = tf.nn.relu(first_conv) if is_training: - first_dropout = tf.nn.dropout(first_relu, 1 - (dropout_prob)) + first_dropout = tf.nn.dropout(first_relu, rate=dropout_rate) else: first_dropout = first_relu first_conv_output_width = math.floor( @@ -426,7 +426,7 @@ def create_low_latency_conv_model(fingerprint_input, model_settings, shape=[first_fc_output_channels]) first_fc = tf.matmul(flattened_first_conv, first_fc_weights) + first_fc_bias if is_training: - second_fc_input = tf.nn.dropout(first_fc, 1 - (dropout_prob)) + second_fc_input = tf.nn.dropout(first_fc, rate=dropout_rate) else: second_fc_input = first_fc second_fc_output_channels = 128 @@ -440,7 +440,7 @@ def create_low_latency_conv_model(fingerprint_input, model_settings, shape=[second_fc_output_channels]) second_fc = tf.matmul(second_fc_input, second_fc_weights) + second_fc_bias if is_training: - final_fc_input = tf.nn.dropout(second_fc, 1 - (dropout_prob)) + final_fc_input = tf.nn.dropout(second_fc, rate=dropout_rate) else: final_fc_input = second_fc label_count = model_settings['label_count'] @@ -454,7 +454,7 @@ def create_low_latency_conv_model(fingerprint_input, model_settings, shape=[label_count]) final_fc = tf.matmul(final_fc_input, final_fc_weights) + final_fc_bias if is_training: - return final_fc, dropout_prob + return final_fc, dropout_rate else: return final_fc @@ -515,7 +515,7 @@ def create_low_latency_svdf_model(fingerprint_input, model_settings, ValueError: If the inputs tensor is incorrectly shaped. """ if is_training: - dropout_prob = tf.compat.v1.placeholder(tf.float32, name='dropout_prob') + dropout_rate = tf.compat.v1.placeholder(tf.float32, name='dropout_rate') input_frequency_size = model_settings['fingerprint_width'] input_time_size = model_settings['spectrogram_length'] @@ -525,12 +525,12 @@ def create_low_latency_svdf_model(fingerprint_input, model_settings, if len(input_shape) != 2: raise ValueError('Inputs to `SVDF` should have rank == 2.') if input_shape[-1].value is None: - raise ValueError('The last dimension of the inputs to `SVDF` ' + raise ValueError('The last dimension of the input to `SVDF` ' 'should be defined. Found `None`.') if input_shape[-1].value % input_frequency_size != 0: - raise ValueError('Inputs feature dimension %d must be a multiple of ' - 'frame size %d', fingerprint_input.shape[-1].value, - input_frequency_size) + raise ValueError('The last dimension of the input to `SVDF` = {0} must be ' + 'a multiple of the frame size = {1}'.format( + input_shape.shape[-1].value, input_frequency_size)) # Set number of units (i.e. nodes) and rank. rank = 2 @@ -545,9 +545,7 @@ def create_low_latency_svdf_model(fingerprint_input, model_settings, trainable=False, name='runtime-memory') first_time_flag = tf.compat.v1.get_variable( - name="first_time_flag", - dtype=tf.int32, - initializer=1) + name='first_time_flag', dtype=tf.int32, initializer=1) # Determine the number of new frames in the input, such that we only operate # on those. For training we do not use the memory, and thus use all frames # provided in the input. 
@@ -624,7 +622,7 @@ def create_low_latency_svdf_model(fingerprint_input, model_settings, first_relu = tf.nn.relu(first_bias) if is_training: - first_dropout = tf.nn.dropout(first_relu, 1 - (dropout_prob)) + first_dropout = tf.nn.dropout(first_relu, rate=dropout_rate) else: first_dropout = first_relu @@ -639,7 +637,7 @@ def create_low_latency_svdf_model(fingerprint_input, model_settings, shape=[first_fc_output_channels]) first_fc = tf.matmul(first_dropout, first_fc_weights) + first_fc_bias if is_training: - second_fc_input = tf.nn.dropout(first_fc, 1 - (dropout_prob)) + second_fc_input = tf.nn.dropout(first_fc, rate=dropout_rate) else: second_fc_input = first_fc second_fc_output_channels = 256 @@ -653,7 +651,7 @@ def create_low_latency_svdf_model(fingerprint_input, model_settings, shape=[second_fc_output_channels]) second_fc = tf.matmul(second_fc_input, second_fc_weights) + second_fc_bias if is_training: - final_fc_input = tf.nn.dropout(second_fc, 1 - (dropout_prob)) + final_fc_input = tf.nn.dropout(second_fc, rate=dropout_rate) else: final_fc_input = second_fc label_count = model_settings['label_count'] @@ -667,7 +665,7 @@ def create_low_latency_svdf_model(fingerprint_input, model_settings, shape=[label_count]) final_fc = tf.matmul(final_fc_input, final_fc_weights) + final_fc_bias if is_training: - return final_fc, dropout_prob + return final_fc, dropout_rate else: return final_fc @@ -712,7 +710,7 @@ def create_tiny_conv_model(fingerprint_input, model_settings, is_training): placeholder. """ if is_training: - dropout_prob = tf.compat.v1.placeholder(tf.float32, name='dropout_prob') + dropout_rate = tf.compat.v1.placeholder(tf.float32, name='dropout_rate') input_frequency_size = model_settings['fingerprint_width'] input_time_size = model_settings['spectrogram_length'] fingerprint_4d = tf.reshape(fingerprint_input, @@ -736,7 +734,7 @@ def create_tiny_conv_model(fingerprint_input, model_settings, is_training): padding='SAME') + first_bias first_relu = tf.nn.relu(first_conv) if is_training: - first_dropout = tf.nn.dropout(first_relu, 1 - (dropout_prob)) + first_dropout = tf.nn.dropout(first_relu, rate=dropout_rate) else: first_dropout = first_relu first_dropout_shape = first_dropout.get_shape() @@ -759,7 +757,7 @@ def create_tiny_conv_model(fingerprint_input, model_settings, is_training): final_fc = ( tf.matmul(flattened_first_dropout, final_fc_weights) + final_fc_bias) if is_training: - return final_fc, dropout_prob + return final_fc, dropout_rate else: return final_fc @@ -817,7 +815,7 @@ def create_tiny_embedding_conv_model(fingerprint_input, model_settings, placeholder. 
""" if is_training: - dropout_prob = tf.compat.v1.placeholder(tf.float32, name='dropout_prob') + dropout_rate = tf.compat.v1.placeholder(tf.float32, name='dropout_rate') input_frequency_size = model_settings['fingerprint_width'] input_time_size = model_settings['spectrogram_length'] fingerprint_4d = tf.reshape(fingerprint_input, @@ -843,7 +841,7 @@ def create_tiny_embedding_conv_model(fingerprint_input, model_settings, padding='SAME') + first_bias first_relu = tf.nn.relu(first_conv) if is_training: - first_dropout = tf.nn.dropout(first_relu, 1 - (dropout_prob)) + first_dropout = tf.nn.dropout(first_relu, rate=dropout_rate) else: first_dropout = first_relu @@ -870,7 +868,7 @@ def create_tiny_embedding_conv_model(fingerprint_input, model_settings, padding='SAME') + second_bias second_relu = tf.nn.relu(second_conv) if is_training: - second_dropout = tf.nn.dropout(second_relu, 1 - (dropout_prob)) + second_dropout = tf.nn.dropout(second_relu, rate=dropout_rate) else: second_dropout = second_relu @@ -894,6 +892,6 @@ def create_tiny_embedding_conv_model(fingerprint_input, model_settings, final_fc = ( tf.matmul(flattened_second_dropout, final_fc_weights) + final_fc_bias) if is_training: - return final_fc, dropout_prob + return final_fc, dropout_rate else: return final_fc diff --git a/tensorflow/examples/speech_commands/models_test.py b/tensorflow/examples/speech_commands/models_test.py index bae5fdec0a2..2b5bf668f2b 100644 --- a/tensorflow/examples/speech_commands/models_test.py +++ b/tensorflow/examples/speech_commands/models_test.py @@ -53,12 +53,12 @@ class ModelsTest(test.TestCase): model_settings = self._modelSettings() with self.cached_session() as sess: fingerprint_input = tf.zeros([1, model_settings["fingerprint_size"]]) - logits, dropout_prob = models.create_model(fingerprint_input, - model_settings, "conv", True) + logits, dropout_rate = models.create_model( + fingerprint_input, model_settings, "conv", True) self.assertIsNotNone(logits) - self.assertIsNotNone(dropout_prob) + self.assertIsNotNone(dropout_rate) self.assertIsNotNone(sess.graph.get_tensor_by_name(logits.name)) - self.assertIsNotNone(sess.graph.get_tensor_by_name(dropout_prob.name)) + self.assertIsNotNone(sess.graph.get_tensor_by_name(dropout_rate.name)) @test_util.run_deprecated_v1 def testCreateModelConvInference(self): @@ -75,24 +75,24 @@ class ModelsTest(test.TestCase): model_settings = self._modelSettings() with self.cached_session() as sess: fingerprint_input = tf.zeros([1, model_settings["fingerprint_size"]]) - logits, dropout_prob = models.create_model( + logits, dropout_rate = models.create_model( fingerprint_input, model_settings, "low_latency_conv", True) self.assertIsNotNone(logits) - self.assertIsNotNone(dropout_prob) + self.assertIsNotNone(dropout_rate) self.assertIsNotNone(sess.graph.get_tensor_by_name(logits.name)) - self.assertIsNotNone(sess.graph.get_tensor_by_name(dropout_prob.name)) + self.assertIsNotNone(sess.graph.get_tensor_by_name(dropout_rate.name)) @test_util.run_deprecated_v1 def testCreateModelFullyConnectedTraining(self): model_settings = self._modelSettings() with self.cached_session() as sess: fingerprint_input = tf.zeros([1, model_settings["fingerprint_size"]]) - logits, dropout_prob = models.create_model( + logits, dropout_rate = models.create_model( fingerprint_input, model_settings, "single_fc", True) self.assertIsNotNone(logits) - self.assertIsNotNone(dropout_prob) + self.assertIsNotNone(dropout_rate) self.assertIsNotNone(sess.graph.get_tensor_by_name(logits.name)) - 
self.assertIsNotNone(sess.graph.get_tensor_by_name(dropout_prob.name)) + self.assertIsNotNone(sess.graph.get_tensor_by_name(dropout_rate.name)) def testCreateModelBadArchitecture(self): model_settings = self._modelSettings() @@ -108,12 +108,12 @@ class ModelsTest(test.TestCase): model_settings = self._modelSettings() with self.cached_session() as sess: fingerprint_input = tf.zeros([1, model_settings["fingerprint_size"]]) - logits, dropout_prob = models.create_model( + logits, dropout_rate = models.create_model( fingerprint_input, model_settings, "tiny_conv", True) self.assertIsNotNone(logits) - self.assertIsNotNone(dropout_prob) + self.assertIsNotNone(dropout_rate) self.assertIsNotNone(sess.graph.get_tensor_by_name(logits.name)) - self.assertIsNotNone(sess.graph.get_tensor_by_name(dropout_prob.name)) + self.assertIsNotNone(sess.graph.get_tensor_by_name(dropout_rate.name)) if __name__ == "__main__": diff --git a/tensorflow/examples/speech_commands/train.py b/tensorflow/examples/speech_commands/train.py index c9ddf8e92a0..e917a51d837 100644 --- a/tensorflow/examples/speech_commands/train.py +++ b/tensorflow/examples/speech_commands/train.py @@ -132,7 +132,7 @@ def main(_): else: fingerprint_input = input_placeholder - logits, dropout_prob = models.create_model( + logits, dropout_rate = models.create_model( fingerprint_input, model_settings, FLAGS.model_architecture, @@ -248,7 +248,7 @@ def main(_): fingerprint_input: train_fingerprints, ground_truth_input: train_ground_truth, learning_rate_input: learning_rate_value, - dropout_prob: 0.5 + dropout_rate: 0.5 }) train_writer.add_summary(train_summary, training_step) tf.compat.v1.logging.info( @@ -271,7 +271,7 @@ def main(_): feed_dict={ fingerprint_input: validation_fingerprints, ground_truth_input: validation_ground_truth, - dropout_prob: 1.0 + dropout_rate: 0.0 }) validation_writer.add_summary(validation_summary, training_step) batch_size = min(FLAGS.batch_size, set_size - i) @@ -305,7 +305,7 @@ def main(_): feed_dict={ fingerprint_input: test_fingerprints, ground_truth_input: test_ground_truth, - dropout_prob: 1.0 + dropout_rate: 0.0 }) batch_size = min(FLAGS.batch_size, set_size - i) total_accuracy += (test_accuracy * batch_size) / set_size From 149f584de1b48e63849102a084292eb66ba90252 Mon Sep 17 00:00:00 2001 From: Jakob Buchgraber Date: Tue, 18 Feb 2020 11:23:47 -0800 Subject: [PATCH 142/442] tensorflow .bazelrc: clean up of deprecated and noop flags PiperOrigin-RevId: 295780391 Change-Id: I51ffd687048fb36d9a44c52add7b3f4de0bf354f --- .bazelrc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.bazelrc b/.bazelrc index dbdadc98ea7..5f9173b9d36 100644 --- a/.bazelrc +++ b/.bazelrc @@ -320,10 +320,8 @@ build:xla --define=with_xla_support=true # Options when using remote execution # WARNING: THESE OPTIONS WONT WORK IF YOU DO NOT HAVE PROPER AUTHENTICATION AND PERMISSIONS build:rbe --action_env=BAZEL_DO_NOT_DETECT_CPP_TOOLCHAIN=1 -build:rbe --auth_enabled=true -build:rbe --auth_scope=https://www.googleapis.com/auth/cloud-source-tools +build:rbe --google_default_credentials build:rbe --bes_backend=buildeventservice.googleapis.com -build:rbe --bes_best_effort=false build:rbe --bes_results_url="https://source.cloud.google.com/results/invocations" build:rbe --bes_timeout=600s build:rbe --define=EXECUTOR=remote @@ -336,7 +334,7 @@ build:rbe --spawn_strategy=remote,worker,standalone,local test:rbe --test_env=USER=anon # Attempt to minimize the amount of data transfer between bazel and the remote # workers: -build:rbe 
--experimental_inmemory_jdeps_files --experimental_inmemory_dotd_files --experimental_remote_download_outputs=toplevel +build:rbe --remote_download_toplevel build:rbe_linux --config=rbe build:rbe_linux --action_env=PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/go/bin" From 26bf35aec55bd805c3b1c8482cd5d59e4a687c75 Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Tue, 18 Feb 2020 11:24:01 -0800 Subject: [PATCH 143/442] Preserve the directives annotation while lowering break statements. PiperOrigin-RevId: 295780462 Change-Id: I48fa59628c110aafe250ba20b7b6cdf2cae73e26 --- .../autograph/converters/break_statements.py | 9 +++++- .../converters/break_statements_test.py | 30 +++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/autograph/converters/break_statements.py b/tensorflow/python/autograph/converters/break_statements.py index c5409077a66..718c5bd3ca5 100644 --- a/tensorflow/python/autograph/converters/break_statements.py +++ b/tensorflow/python/autograph/converters/break_statements.py @@ -71,6 +71,7 @@ class BreakTransformer(converter.Base): return nodes, break_used def visit_While(self, node): + original_node = node scope = anno.getanno(node, NodeAnno.BODY_SCOPE) break_var = self.ctx.namer.new_symbol('break_', scope.referenced) @@ -98,9 +99,13 @@ class BreakTransformer(converter.Base): body=node.body, orelse=guarded_orelse) + new_while_node = node[1] + anno.copyanno(original_node, new_while_node, anno.Basic.DIRECTIVES) + return node def visit_For(self, node): + original_node = node scope = anno.getanno(node, NodeAnno.BODY_SCOPE) break_var = self.ctx.namer.new_symbol('break_', scope.referenced) @@ -137,7 +142,9 @@ class BreakTransformer(converter.Base): body=node.body, orelse=guarded_orelse) - anno.setanno(node[1], 'extra_test', extra_test) + new_for_node = node[1] + anno.setanno(new_for_node, 'extra_test', extra_test) + anno.copyanno(original_node, new_for_node, anno.Basic.DIRECTIVES) return node diff --git a/tensorflow/python/autograph/converters/break_statements_test.py b/tensorflow/python/autograph/converters/break_statements_test.py index c789ced095d..37accdcc1be 100644 --- a/tensorflow/python/autograph/converters/break_statements_test.py +++ b/tensorflow/python/autograph/converters/break_statements_test.py @@ -20,6 +20,7 @@ from __future__ import print_function from tensorflow.python.autograph.converters import break_statements from tensorflow.python.autograph.core import converter_testing +from tensorflow.python.autograph.pyct import anno from tensorflow.python.framework import constant_op from tensorflow.python.platform import test @@ -46,6 +47,21 @@ class BreakCanonicalizationTest(converter_testing.TestCase): self.assertTransformedEquivalent(test_fn, 1) self.assertTransformedEquivalent(test_fn, 4) + def test_while_loop_preserves_directives(self): + + def test_fn(x): + while x > 0: + x -= 1 + if x % 2 == 0: + break + + node, ctx = self.prepare(test_fn, {}) + fake_annotation = object() + anno.setanno(node.body[0], anno.Basic.DIRECTIVES, fake_annotation) + node = break_statements.transform(node, ctx) + self.assertIs( + anno.getanno(node.body[1], anno.Basic.DIRECTIVES), fake_annotation) + def test_for_loop(self): def test_fn(a): @@ -63,6 +79,20 @@ class BreakCanonicalizationTest(converter_testing.TestCase): # but the section following the break will be skipped. 
self.assertEqual([3], result.test_fn([5, 4])) + def test_for_loop_preserves_directives(self): + + def test_fn(a): + for x in a: + if x % 2 == 0: + break + + node, ctx = self.prepare(test_fn, {}) + fake_annotation = object() + anno.setanno(node.body[0], anno.Basic.DIRECTIVES, fake_annotation) + node = break_statements.transform(node, ctx) + self.assertIs( + anno.getanno(node.body[1], anno.Basic.DIRECTIVES), fake_annotation) + def test_nested(self): def test_fn(x): From e4381fd70b46b0a860e9970dd18f427fe94c1291 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Tue, 18 Feb 2020 11:24:05 -0800 Subject: [PATCH 144/442] Change BreakupIslands pass from an Operation pass into a Function pass This is fixing a crash when there are external functions in the module. The subtle difference between: OperationPass and: FunctionPass is that the latter will skip over external functions (functions without a body) but not the former. PiperOrigin-RevId: 295780488 Change-Id: I032e806bbc7d8e80375fa776bdc8873f850d7c58 --- .../compiler/mlir/tensorflow/tests/breakup-islands.mlir | 4 ++++ .../compiler/mlir/tensorflow/translate/breakup-islands.cc | 8 ++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir b/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir index d90c9201a83..8659f52e301 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir @@ -3,6 +3,10 @@ // All tests also test for idempotence. +// Test that external functions aren't processed (used to crash). +// CHECK-LABEL: func @unused_external_func +func @unused_external_func() + func @multiple_return(%arg0: tensor<*xi32>, %arg1: tensor) -> (tensor<*xi32>, tensor<*xi32>) { %graph:2 = tf_executor.graph { %island:3 = tf_executor.island { diff --git a/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc b/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc index cef1f4e5567..d40eec62cdc 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc @@ -42,8 +42,8 @@ namespace mlir { namespace { -struct BreakUpIslands : OperationPass { - void runOnOperation() final; +struct BreakUpIslands : FunctionPass { + void runOnFunction() final; void BreakUpIsland(tf_executor::IslandOp island_op, const TF::SideEffectAnalysis& side_effect_analysis, @@ -51,8 +51,8 @@ struct BreakUpIslands : OperationPass { new_control_inputs); }; -void BreakUpIslands::runOnOperation() { - auto graph_op_range = getOperation().getBody().front().without_terminator(); +void BreakUpIslands::runOnFunction() { + auto graph_op_range = getFunction().getBody().front().without_terminator(); tf_executor::GraphOp graph_op; if (graph_op_range.begin() != graph_op_range.end() && std::next(graph_op_range.begin()) == graph_op_range.end()) { From 846939973bb9665cec2770b2edbdb478f1258e21 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 18 Feb 2020 11:26:52 -0800 Subject: [PATCH 145/442] Go: Update generated wrapper functions for TensorFlow ops. 
PiperOrigin-RevId: 295781318 Change-Id: I571a76eccfccca4151c29969ccd78b659fbabe87 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index ffa9931d561..86be1ef98aa 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. 
The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21329,7 +21329,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22037,7 +22037,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22233,7 +22233,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22302,7 +22302,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22417,7 +22417,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22476,7 +22476,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22650,7 +22650,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22841,7 +22841,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25281,7 +25281,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25613,7 +25613,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25663,7 +25663,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25913,7 +25913,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26543,7 +26543,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27608,7 +27608,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45467,7 +45467,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 24839fe95cf43bf4f154f530b48d1aa427756486 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Tue, 18 Feb 2020 11:29:43 -0800 Subject: [PATCH 146/442] Create the Java build artifact for MetadataExtractor PiperOrigin-RevId: 295782134 Change-Id: I4e617b56bcd9406aa709913f407d828932c593fb --- .../lite/experimental/support/java/BUILD | 23 +++++++++++++++++-- .../support/common/SupportPreconditions.java | 11 ++++----- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/tensorflow/lite/experimental/support/java/BUILD b/tensorflow/lite/experimental/support/java/BUILD index 1d392578afa..e6b964bcae8 100644 --- a/tensorflow/lite/experimental/support/java/BUILD +++ b/tensorflow/lite/experimental/support/java/BUILD @@ -1,13 +1,14 @@ # Description: # TensorFlow Lite Support API in Java. +load("@build_bazel_rules_android//android:rules.bzl", "android_library") +load("//tensorflow/java:build_defs.bzl", "JAVACOPTS") + package( default_visibility = ["//visibility:public"], licenses = ["notice"], # Apache 2.0 ) -load("@build_bazel_rules_android//android:rules.bzl", "android_library") - # TODO(138904786): Split Java part and Android part to make the support library usable by pure Java. 
android_library( name = "tensorflow-lite-support", @@ -26,3 +27,21 @@ alias( name = "tensorflowlite_support", actual = ":tensorflow-lite-support", ) + +java_library( + name = "tensorflow-lite-support-precondition", + srcs = ["src/java/org/tensorflow/lite/support/common/SupportPreconditions.java"], + javacopts = JAVACOPTS, + deps = [ + "@org_checkerframework_qual", + ], +) + +android_library( + name = "tensorflow-lite-support-precondition-lib-android", + srcs = ["src/java/org/tensorflow/lite/support/common/SupportPreconditions.java"], + manifest = "AndroidManifest.xml", + deps = [ + "@org_checkerframework_qual", + ], +) diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/SupportPreconditions.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/SupportPreconditions.java index d4c4b4dcb23..8620e13eec7 100644 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/SupportPreconditions.java +++ b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/common/SupportPreconditions.java @@ -15,8 +15,6 @@ limitations under the License. package org.tensorflow.lite.support.common; -import android.text.TextUtils; -import org.checkerframework.checker.nullness.qual.NonNull; import org.checkerframework.checker.nullness.qual.Nullable; /** Static error checking util methods. */ @@ -28,7 +26,7 @@ public final class SupportPreconditions { * @return the non-null reference that was validated * @throws NullPointerException if {@code reference} is null */ - public static T checkNotNull(T reference) { + public static T checkNotNull(T reference) { if (reference == null) { throw new NullPointerException("The object reference is null."); } @@ -44,8 +42,7 @@ public final class SupportPreconditions { * @return the non-null reference that was validated * @throws NullPointerException if {@code reference} is null */ - public static T checkNotNull( - T reference, @Nullable Object errorMessage) { + public static T checkNotNull(T reference, @Nullable Object errorMessage) { if (reference == null) { throw new NullPointerException(String.valueOf(errorMessage)); } @@ -60,7 +57,7 @@ public final class SupportPreconditions { * @throws IllegalArgumentException if {@code string} is null or empty */ public static String checkNotEmpty(String string) { - if (TextUtils.isEmpty(string)) { + if (string == null || string.length() == 0) { throw new IllegalArgumentException("Given String is empty or null."); } return string; @@ -76,7 +73,7 @@ public final class SupportPreconditions { * @throws IllegalArgumentException if {@code string} is null or empty */ public static String checkNotEmpty(String string, Object errorMessage) { - if (TextUtils.isEmpty(string)) { + if (string == null || string.length() == 0) { throw new IllegalArgumentException(String.valueOf(errorMessage)); } return string; From f60fc7a072182df99ddbef50a873e8a544341855 Mon Sep 17 00:00:00 2001 From: Jakob Buchgraber Date: Tue, 18 Feb 2020 11:30:58 -0800 Subject: [PATCH 147/442] remote config: replace all uses of os.environ by get_host_environ This change is in prepartion for rolling out remote config. It will allow us to inject environment variables from repository rules as well as from the shell enviroment. 
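Not part of this patch: a minimal Starlark sketch of how a repository rule might consume `get_host_environ` once values can come from rule attributes as well as the shell. The rule name, the `environ` string-dict attribute, and the `TF_NEED_FOO` variable are illustrative assumptions, not code from this change.

```python
# my_config.bzl (hypothetical) -- sketch only, not part of the patch.
load("//third_party/remote_config:common.bzl", "get_host_environ")

def _my_config_impl(repository_ctx):
    # Lookup order after this change: the host shell environment first,
    # then the rule's `environ` attribute (the hook remote config can use
    # to inject values), then the supplied default.
    need_foo = get_host_environ(repository_ctx, "TF_NEED_FOO", "0")
    repository_ctx.file("BUILD", "")
    repository_ctx.file("config.bzl", "NEED_FOO = %r\n" % need_foo)

my_config = repository_rule(
    implementation = _my_config_impl,
    attrs = {"environ": attr.string_dict()},
    environ = ["TF_NEED_FOO"],
)
```

With such a rule, exporting `TF_NEED_FOO=1` in the shell or injecting it through the `environ` attribute produces the same value inside the rule body.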
PiperOrigin-RevId: 295782466 Change-Id: I1eb61fca3556473e94f2f12c45ee5eb1fe51625b --- third_party/gpus/cuda_configure.bzl | 39 ++++++++-------- third_party/gpus/rocm_configure.bzl | 50 +++++++++------------ third_party/nccl/nccl_configure.bzl | 6 +-- third_party/remote_config/common.bzl | 10 ++++- third_party/tensorrt/tensorrt_configure.bzl | 7 +-- 5 files changed, 54 insertions(+), 58 deletions(-) diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index 6fbe306457f..1f132e96f2c 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -43,6 +43,7 @@ load( "execute", "get_bash_bin", "get_cpu_value", + "get_host_environ", "get_python_bin", "is_windows", "raw_exec", @@ -223,10 +224,9 @@ def find_cc(repository_ctx): cc_path_envvar = _GCC_HOST_COMPILER_PATH cc_name = target_cc_name - if cc_path_envvar in repository_ctx.os.environ: - cc_name_from_env = repository_ctx.os.environ[cc_path_envvar].strip() - if cc_name_from_env: - cc_name = cc_name_from_env + cc_name_from_env = get_host_environ(repository_ctx, cc_path_envvar) + if cc_name_from_env: + cc_name = cc_name_from_env if cc_name.startswith("/"): # Absolute path, maybe we should make this supported by our which function. return cc_name @@ -365,7 +365,7 @@ def _cuda_include_path(repository_ctx, cuda_config): def enable_cuda(repository_ctx): """Returns whether to build with CUDA support.""" - return int(repository_ctx.os.environ.get("TF_NEED_CUDA", False)) + return int(get_host_environ(repository_ctx, "TF_NEED_CUDA", False)) def matches_version(environ_version, detected_version): """Checks whether the user-specified version matches the detected version. @@ -409,9 +409,9 @@ _DEFINE_CUDNN_MAJOR = "#define CUDNN_MAJOR" def compute_capabilities(repository_ctx): """Returns a list of strings representing cuda compute capabilities.""" - if _TF_CUDA_COMPUTE_CAPABILITIES not in repository_ctx.os.environ: + capabilities_str = get_host_environ(repository_ctx, _TF_CUDA_COMPUTE_CAPABILITIES) + if capabilities_str == None: return _DEFAULT_CUDA_COMPUTE_CAPABILITIES - capabilities_str = repository_ctx.os.environ[_TF_CUDA_COMPUTE_CAPABILITIES] capabilities = capabilities_str.split(",") for capability in capabilities: # Workaround for Skylark's lack of support for regex. 
This check should @@ -805,18 +805,13 @@ def make_copy_dir_rule(repository_ctx, name, src_dir, out_dir): )""" % (name, "\n".join(outs), src_dir, out_dir) def _flag_enabled(repository_ctx, flag_name): - if flag_name in repository_ctx.os.environ: - value = repository_ctx.os.environ[flag_name].strip() - return value == "1" - return False + return get_host_environ(repository_ctx, flag_name) == "1" def _use_cuda_clang(repository_ctx): return _flag_enabled(repository_ctx, "TF_CUDA_CLANG") def _tf_sysroot(repository_ctx): - if _TF_SYSROOT in repository_ctx.os.environ: - return repository_ctx.os.environ[_TF_SYSROOT] - return "" + return get_host_environ(repository_ctx, _TF_SYSROOT, "") def _compute_cuda_extra_copts(repository_ctx, compute_capabilities): capability_flags = [ @@ -1006,9 +1001,10 @@ def _create_local_cuda_repository(repository_ctx): if is_cuda_clang: cuda_defines["%{cuda_toolkit_path}"] = cuda_config.config["cuda_toolkit_path"] - host_compiler_prefix = "/usr/bin" - if _GCC_HOST_COMPILER_PREFIX in repository_ctx.os.environ: - host_compiler_prefix = repository_ctx.os.environ[_GCC_HOST_COMPILER_PREFIX].strip() + host_compiler_prefix = get_host_environ(repository_ctx, _GCC_HOST_COMPILER_PREFIX) + if not host_compiler_prefix: + host_compiler_prefix = "/usr/bin" + cuda_defines["%{host_compiler_prefix}"] = host_compiler_prefix # Bazel sets '-B/usr/bin' flag to workaround build errors on RHEL (see @@ -1157,14 +1153,15 @@ def _cuda_autoconf_impl(repository_ctx): """Implementation of the cuda_autoconf repository rule.""" if not enable_cuda(repository_ctx): _create_dummy_repository(repository_ctx) - elif _TF_CUDA_CONFIG_REPO in repository_ctx.os.environ: - if (_TF_CUDA_VERSION not in repository_ctx.os.environ or - _TF_CUDNN_VERSION not in repository_ctx.os.environ): + elif get_host_environ(repository_ctx, _TF_CUDA_CONFIG_REPO) != None: + has_cuda_version = get_host_environ(repository_ctx, _TF_CUDA_VERSION) != None + has_cudnn_version = get_host_environ(repository_ctx, _TF_CUDNN_VERSION) != None + if not has_cuda_version or not has_cudnn_version: auto_configure_fail("%s and %s must also be set if %s is specified" % (_TF_CUDA_VERSION, _TF_CUDNN_VERSION, _TF_CUDA_CONFIG_REPO)) _create_remote_cuda_repository( repository_ctx, - repository_ctx.os.environ[_TF_CUDA_CONFIG_REPO], + get_host_environ(repository_ctx, _TF_CUDA_CONFIG_REPO), ) else: _create_local_cuda_repository(repository_ctx) diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl index de885f71d18..063271b83f2 100644 --- a/third_party/gpus/rocm_configure.bzl +++ b/third_party/gpus/rocm_configure.bzl @@ -26,6 +26,7 @@ load( "files_exist", "get_bash_bin", "get_cpu_value", + "get_host_environ", "raw_exec", "realpath", "which", @@ -79,10 +80,9 @@ def find_cc(repository_ctx): cc_path_envvar = _GCC_HOST_COMPILER_PATH cc_name = target_cc_name - if cc_path_envvar in repository_ctx.os.environ: - cc_name_from_env = repository_ctx.os.environ[cc_path_envvar].strip() - if cc_name_from_env: - cc_name = cc_name_from_env + cc_name_from_env = get_host_environ(repository_ctx, cc_path_envvar) + if cc_name_from_env: + cc_name = cc_name_from_env if cc_name.startswith("/"): # Absolute path, maybe we should make this supported by our which function. 
return cc_name @@ -252,13 +252,12 @@ def _rocm_include_path(repository_ctx, rocm_config): return inc_dirs def _enable_rocm(repository_ctx): - if "TF_NEED_ROCM" in repository_ctx.os.environ: - enable_rocm = repository_ctx.os.environ["TF_NEED_ROCM"].strip() - if enable_rocm == "1": - if get_cpu_value(repository_ctx) != "Linux": - auto_configure_warning("ROCm configure is only supported on Linux") - return False - return True + enable_rocm = get_host_environ(repository_ctx, "TF_NEED_ROCM") + if enable_rocm == "1": + if get_cpu_value(repository_ctx) != "Linux": + auto_configure_warning("ROCm configure is only supported on Linux") + return False + return True return False def _rocm_toolkit_path(repository_ctx, bash_bin): @@ -270,18 +269,16 @@ def _rocm_toolkit_path(repository_ctx, bash_bin): Returns: A speculative real path of the rocm toolkit install directory. """ - rocm_toolkit_path = _DEFAULT_ROCM_TOOLKIT_PATH - if _ROCM_TOOLKIT_PATH in repository_ctx.os.environ: - rocm_toolkit_path = repository_ctx.os.environ[_ROCM_TOOLKIT_PATH].strip() + rocm_toolkit_path = get_host_environ(repository_ctx, _ROCM_TOOLKIT_PATH, _DEFAULT_ROCM_TOOLKIT_PATH) if files_exist(repository_ctx, [rocm_toolkit_path], bash_bin) != [True]: auto_configure_fail("Cannot find rocm toolkit path.") return realpath(repository_ctx, rocm_toolkit_path, bash_bin) def _amdgpu_targets(repository_ctx): """Returns a list of strings representing AMDGPU targets.""" - if _TF_ROCM_AMDGPU_TARGETS not in repository_ctx.os.environ: + amdgpu_targets_str = get_host_environ(repository_ctx, _TF_ROCM_AMDGPU_TARGETS) + if not amdgpu_targets_str: return _DEFAULT_ROCM_AMDGPU_TARGETS - amdgpu_targets_str = repository_ctx.os.environ[_TF_ROCM_AMDGPU_TARGETS] amdgpu_targets = amdgpu_targets_str.split(",") for amdgpu_target in amdgpu_targets: if amdgpu_target[:3] != "gfx" or not amdgpu_target[3:].isdigit(): @@ -308,9 +305,9 @@ def _hipcc_env(repository_ctx): "HCC_AMDGPU_TARGET", "HIP_PLATFORM", ]: - if name in repository_ctx.os.environ: - hipcc_env = (hipcc_env + " " + name + "=\"" + - repository_ctx.os.environ[name].strip() + "\";") + env_value = get_host_environ(repository_ctx, name) + if env_value: + hipcc_env = (hipcc_env + " " + name + "=\"" + env_value + "\";") return hipcc_env.strip() def _hipcc_is_hipclang(repository_ctx, rocm_config, bash_bin): @@ -328,7 +325,7 @@ def _hipcc_is_hipclang(repository_ctx, rocm_config, bash_bin): # check user-defined hip-clang environment variables for name in ["HIP_CLANG_PATH", "HIP_VDI_HOME"]: - if name in repository_ctx.os.environ: + if get_host_environ(repository_ctx, name): return "True" # grep for "HIP_COMPILER=clang" in /opt/rocm/hip/lib/.hipInfo @@ -367,10 +364,7 @@ def _crosstool_verbose(repository_ctx): Returns: A string containing value of environment variable CROSSTOOL_VERBOSE. """ - name = "CROSSTOOL_VERBOSE" - if name in repository_ctx.os.environ: - return repository_ctx.os.environ[name].strip() - return "0" + return get_host_environ(repository_ctx, "CROSSTOOL_VERBOSE", "0") def _lib_name(lib, version = "", static = False): """Constructs the name of a library on Linux. 
@@ -701,9 +695,7 @@ def _create_local_rocm_repository(repository_ctx): host_compiler_includes = get_cxx_inc_directories(repository_ctx, cc) - host_compiler_prefix = "/usr/bin" - if _GCC_HOST_COMPILER_PREFIX in repository_ctx.os.environ: - host_compiler_prefix = repository_ctx.os.environ[_GCC_HOST_COMPILER_PREFIX].strip() + host_compiler_prefix = get_host_environ(repository_ctx, _GCC_HOST_COMPILER_PREFIX, "/usr/bin") rocm_defines = {} @@ -823,10 +815,10 @@ def _rocm_autoconf_impl(repository_ctx): """Implementation of the rocm_autoconf repository rule.""" if not _enable_rocm(repository_ctx): _create_dummy_repository(repository_ctx) - elif _TF_ROCM_CONFIG_REPO in repository_ctx.os.environ: + elif get_host_environ(repository_ctx, _TF_ROCM_CONFIG_REPO) != None: _create_remote_rocm_repository( repository_ctx, - repository_ctx.os.environ[_TF_ROCM_CONFIG_REPO], + get_host_environ(repository_ctx, _TF_ROCM_CONFIG_REPO), ) else: _create_local_rocm_repository(repository_ctx) diff --git a/third_party/nccl/nccl_configure.bzl b/third_party/nccl/nccl_configure.bzl index 952276a0701..363a65f1f43 100644 --- a/third_party/nccl/nccl_configure.bzl +++ b/third_party/nccl/nccl_configure.bzl @@ -20,6 +20,7 @@ load( load( "//third_party/remote_config:common.bzl", "get_cpu_value", + "get_host_environ", ) _CUDA_TOOLKIT_PATH = "CUDA_TOOLKIT_PATH" @@ -76,9 +77,8 @@ def _nccl_configure_impl(repository_ctx): # See https://github.com/tensorflow/tensorflow/commit/62bd3534525a036f07d9851b3199d68212904778 find_cuda_config_path = repository_ctx.path(Label("@org_tensorflow//third_party/gpus:find_cuda_config.py")) - nccl_version = "" - if _TF_NCCL_VERSION in repository_ctx.os.environ: - nccl_version = repository_ctx.os.environ[_TF_NCCL_VERSION].strip() + nccl_version = get_host_environ(repository_ctx, _TF_NCCL_VERSION, "") + if nccl_version: nccl_version = nccl_version.split(".")[0] cuda_config = find_cuda_config(repository_ctx, find_cuda_config_path, ["cuda"]) diff --git a/third_party/remote_config/common.bzl b/third_party/remote_config/common.bzl index 6f6e4be2304..353e9bb1a63 100644 --- a/third_party/remote_config/common.bzl +++ b/third_party/remote_config/common.bzl @@ -135,7 +135,7 @@ def get_environ(repository_ctx, name, default_value = None): return default_value return result.stdout -def get_host_environ(repository_ctx, name): +def get_host_environ(repository_ctx, name, default_value = None): """Returns the value of an environment variable on the host platform. The host platform is the machine that Bazel runs on. @@ -147,7 +147,13 @@ def get_host_environ(repository_ctx, name): Returns: The value of the environment variable 'name' on the host platform. """ - return repository_ctx.os.environ.get(name) + if name in repository_ctx.os.environ: + return repository_ctx.os.environ.get(name).strip() + + if hasattr(repository_ctx.attr, "environ") and name in repository_ctx.attr.environ: + return repository_ctx.attr.environ.get(name).strip() + + return default_value def is_windows(repository_ctx): """Returns true if the execution platform is Windows. 
diff --git a/third_party/tensorrt/tensorrt_configure.bzl b/third_party/tensorrt/tensorrt_configure.bzl index 1d780e855cc..b3375dc224f 100644 --- a/third_party/tensorrt/tensorrt_configure.bzl +++ b/third_party/tensorrt/tensorrt_configure.bzl @@ -15,6 +15,7 @@ load( load( "//third_party/remote_config:common.bzl", "get_cpu_value", + "get_host_environ", ) _TENSORRT_INSTALL_PATH = "TENSORRT_INSTALL_PATH" @@ -72,14 +73,14 @@ def _create_dummy_repository(repository_ctx): def enable_tensorrt(repository_ctx): """Returns whether to build with TensorRT support.""" - return int(repository_ctx.os.environ.get(_TF_NEED_TENSORRT, False)) + return int(get_host_environ(repository_ctx, _TF_NEED_TENSORRT, False)) def _tensorrt_configure_impl(repository_ctx): """Implementation of the tensorrt_configure repository rule.""" - if _TF_TENSORRT_CONFIG_REPO in repository_ctx.os.environ: + if get_host_environ(repository_ctx, _TF_TENSORRT_CONFIG_REPO) != None: # Forward to the pre-configured remote repository. - remote_config_repo = repository_ctx.os.environ[_TF_TENSORRT_CONFIG_REPO] + remote_config_repo = get_host_environ(repository_ctx, _TF_TENSORRT_CONFIG_REPO) repository_ctx.template("BUILD", Label(remote_config_repo + ":BUILD"), {}) repository_ctx.template( "build_defs.bzl", From b7796f3c856965f68699b2b527fc2872e2aa71ad Mon Sep 17 00:00:00 2001 From: Jakob Buchgraber Date: Tue, 18 Feb 2020 11:50:58 -0800 Subject: [PATCH 148/442] cuda_configure: make find_cuda_config() compatible with remote execution repository_ctx.execute() does not support uploading of files from the source tree. I initially tried constructing a command that simply embeds the file's contents. However that did not work on Windows because the file is larger than 8192 characters. So my best idea was to compress it locally and embed the compressed contents in the command and to uncompress it remotely. This works but comes with the drawback that we need to compress it first. This can't be done as part of the repository_rule either because within one repository_rule every execute() runs either locally or remotely. I thus decided to check in the compressed version in the source tree. It's very much a temporary measure as I'll add the ability to upload files to a future version of Bazel. 
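A self-contained illustration of the embed-and-decompress trick described above (plain Python, not part of the patch; the real code additionally writes the payload to `script.py` and runs it through the configured Python binary rather than exec()-ing it in-process):

```python
import base64
import zlib

# Stand-in for find_cuda_config.py; the real payload is the whole script.
script = b"print('hello from find_cuda_config')\n"

# What the checked-in find_cuda_config.py.gz.base64 file holds:
embedded = base64.b64encode(zlib.compress(script)).decode("ascii")

# Shape of the command the repository rule hands to `python -c`: the
# compressed payload is inlined, so no file needs to be uploaded, and the
# command stays short enough for cmd.exe in this toy example.
command = (
    "from zlib import decompress;"
    "from base64 import b64decode;"
    "exec(decompress(b64decode('%s')).decode());" % embedded
)
exec(command)  # prints: hello from find_cuda_config
```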
PiperOrigin-RevId: 295787408 Change-Id: I1545dd86cdec7e4b20cba43d6a134ad6d1a08109 --- tensorflow/opensource_only.files | 1 + third_party/gpus/compress_find_cuda_config.py | 37 +++++++++++++++++++ third_party/gpus/cuda_configure.bzl | 31 +++++++++++++--- .../gpus/find_cuda_config.py.gz.base64 | 1 + third_party/nccl/nccl_configure.bzl | 2 +- third_party/tensorrt/tensorrt_configure.bzl | 2 +- 6 files changed, 67 insertions(+), 7 deletions(-) create mode 100644 third_party/gpus/compress_find_cuda_config.py create mode 100644 third_party/gpus/find_cuda_config.py.gz.base64 diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files index 4cec73276da..c282a6021ee 100644 --- a/tensorflow/opensource_only.files +++ b/tensorflow/opensource_only.files @@ -95,6 +95,7 @@ tensorflow/third_party/gpus/cuda/build_defs.bzl.tpl tensorflow/third_party/gpus/cuda/cuda_config.h.tpl tensorflow/third_party/gpus/cuda_configure.bzl tensorflow/third_party/gpus/find_cuda_config.py +tensorflow/third_party/gpus/find_cuda_config.py.gz.base64 tensorflow/third_party/gpus/rocm/BUILD tensorflow/third_party/gpus/rocm/BUILD.tpl tensorflow/third_party/gpus/rocm/build_defs.bzl.tpl diff --git a/third_party/gpus/compress_find_cuda_config.py b/third_party/gpus/compress_find_cuda_config.py new file mode 100644 index 00000000000..98be39d9245 --- /dev/null +++ b/third_party/gpus/compress_find_cuda_config.py @@ -0,0 +1,37 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Compresses the contents of find_cuda_config.py.oss. + +The compressed file is what is actually being used. It works around remote +config not being able to upload files yet. +""" +import base64 +import zlib + + +def main(): + with open('find_cuda_config.py.oss', 'rb') as f: + data = f.read() + + compressed = zlib.compress(data) + b64encoded = base64.b64encode(compressed) + + with open('find_cuda_config.py.gz.base64.oss', 'wb') as f: + f.write(b64encoded) + + +if __name__ == '__main__': + main() + diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index 1f132e96f2c..5dcdfdbad73 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -579,14 +579,35 @@ def _cudart_static_linkopt(cpu_value): """Returns additional platform-specific linkopts for cudart.""" return "" if cpu_value == "Darwin" else "\"-lrt\"," +def _exec_find_cuda_config(repository_ctx, script_path, cuda_libraries): + python_bin = get_python_bin(repository_ctx) + + # If used with remote execution then repository_ctx.execute() can't + # access files from the source tree. A trick is to read the contents + # of the file in Starlark and embed them as part of the command. In + # this case the trick is not sufficient as the find_cuda_config.py + # script has more than 8192 characters. 8192 is the command length + # limit of cmd.exe on Windows. 
Thus we additionally need to compress + # the contents locally and decompress them as part of the execute(). + compressed_contents = repository_ctx.read(script_path) + decompress_and_execute_cmd = ( + "from zlib import decompress;" + + "from base64 import b64decode;" + + "from os import system;" + + "script = decompress(b64decode('%s'));" % compressed_contents + + "f = open('script.py', 'wb');" + + "f.write(script);" + + "f.close();" + + "system('%s script.py %s');" % (python_bin, " ".join(cuda_libraries)) + ) + + return execute(repository_ctx, [python_bin, "-c", decompress_and_execute_cmd]) + # TODO(csigg): Only call once instead of from here, tensorrt_configure.bzl, # and nccl_configure.bzl. def find_cuda_config(repository_ctx, script_path, cuda_libraries): """Returns CUDA config dictionary from running find_cuda_config.py""" - exec_result = raw_exec(repository_ctx, [ - get_python_bin(repository_ctx), - script_path, - ] + cuda_libraries) + exec_result = _exec_find_cuda_config(repository_ctx, script_path, cuda_libraries) if exec_result.return_code: auto_configure_fail("Failed to run find_cuda_config.py: %s" % err_out(exec_result)) @@ -858,7 +879,7 @@ def _create_local_cuda_repository(repository_ctx): "cuda:cuda_config.h", ]} tpl_paths["cuda:BUILD"] = _tpl_path(repository_ctx, "cuda:BUILD.windows" if is_windows(repository_ctx) else "cuda:BUILD") - find_cuda_config_script = repository_ctx.path(Label("@org_tensorflow//third_party/gpus:find_cuda_config.py")) + find_cuda_config_script = repository_ctx.path(Label("@org_tensorflow//third_party/gpus:find_cuda_config.py.gz.base64")) cuda_config = _get_cuda_config(repository_ctx, find_cuda_config_script) diff --git a/third_party/gpus/find_cuda_config.py.gz.base64 b/third_party/gpus/find_cuda_config.py.gz.base64 new file mode 100644 index 00000000000..418acdfd5ac --- /dev/null +++ b/third_party/gpus/find_cuda_config.py.gz.base64 @@ -0,0 +1 @@ 
+eJzNXHtz47iR/5+fAseJy6RHpu291FZOF++V157JKvHZU7Zm9lK2o0AkLHFNkQofkpVUvvt1NwASBCm/JlsZVa1HAtFAd+PXLxDYd+w0W27yeDYv2XeHR//FxnPBxiItsvxjkq3ZSVXOs7wI2EmSsCvsVrArUYh8JaLAeee8Y+dxCN1FxKo0Ejkrgf5kyUP4Rz0ZsC8iL+IsZd8Fh8zDDq565Pr/DSNssoot+IalWcmqQsAQccHu40Qw8RiKZcnilIXZYpnEPA0FW8flnKZRgwAb7M9qiGxacujNof8Sft2b/RgviWH8zMtyOTw4WK/XASdmgyyfHSSyY3FwPjr9cHH9YR8YJpLPaSKKguXib1Wcg6jTDeNL4CfkU+Ay4WuW5YzPcgHPygz5XedxGaezASuy+3LNcwGjRHFR5vG0KlvK0tyBzGYHUBdPmXtyzUbXLvvx5Hp0PYAxfh6Nf7r8PGY/n1xdnVyMRx+u2eUVO728OBuNR5cX8OsjO7n4M/vT6OJswASoCqYRj8sc+QcmY1QjLR27FqLFwH0mGSqWIozv4xDkSmcVnwk2y1YiT0EcthT5Ii5wMQtgL4JRkngRl7yklo5QOM3xv/TjuK77KY9TgOHp57MTmH6a83yDzLC54Dh/BEsUllkeC+KRrST6AFIZMIiKJSk3RSkWgeMg4IswjwFnheA5YKEgVWwbHoFZtEcZwIqj1srCgcYFQiASJaoqJRXHuWaCBlpK/pE+zNL7eFblpECkK8ooq8qAuFrycl5IPNHoRIxUNQ5r0QBget0QgvM8q2ZzJtJVnGfpQqSls+J5jGgFUx7dg6mxFU/iyGIgVkoaSOGkVjS7xJzIc1r4XJRVTiBg0ATqCrNIKG0mAGO0Pak8XAagvY+BeRi/4ZIj27MKuQOmrqvlMssR+Q0Zmg0tgxenYVJF0BRWP56fXPsD+HJ2cTFgF6en5wNSjHRaV+P2kpb8AQeqeZpyQLqJkIYfMGqUevxxgnNOPp2Mf7p2DBUyrULkHPzRgu8XYslBdUA8S7IpTRIwY/Ykyx4kmiR4Cgc51aCSSCJ3Ned5tI8qjACExGhRTU027/NsgewB9yQCYSNwYClb/KKG0Y3WUoFu2OV1Y9ORuOdVUmI/cLXR0HEYGGtaPQ7BKx5URX6QZCFPDsIq4gNqkNqVWCQudpNIoobtL3cDoP8ZljZbF0NW88H61DYA/+Ow5nM6vL39lGeznC/YR1TF7e3Fl9HZ6IT94dNnCEuLZYUulI2zLHmIy9tbHP72ds9xPoJCpzx8AL8a0VIAd/E0TuJygw53IUwMJUVGUYUnYJEpdFxJFTod9WbSWfawDqtEoWTTNrQAVEewctSS7td67h1FaVuuPqvDIvRtsMVOoihGY+RJB6xAvf+VHxiCLIo+GjpfPlxdQ/DQ64JN48vL8z+NxrSaQCPNrqbBHy2qfhqwUGOei4v2RLJpdHE9Pjk/10RozzVz+MNijppMGukCJj+dXekhtCOgIcYQxy+vrsbGMHVTa2rni/al/QAIwdNNhU4pwKTBEB53MaTuPgabXXRx6JYF4IE7NQyUg4ZIvFiWG+xdpYafzhgPKcPh6Ub3Vf4LogDgH+fjKnBptwGma3AxdMr7ye/Vsx8mapAhA6aCv7efSR80AUANWRAE7YfqS/MUAq3jQLoATpnFmf6WFfobejz9fZnwEpnRv8Fvq2/gxpZ5FkL2UbdsCgfSguUmgbAyxJwHFXw825fP98F97fNyv8yWTplvhrCgZJzFHLxBwtQg63kczh2VHo6o7QOGobo7plJIUATFkq9TTYdRaCIeRViV2pPLoRqGRLqVH8cJEw551Cm5P5rQ+0A8gMp9nHsJj6EbeFk2iQtQKnhWj57IgFlrKpCZg+dDfoTpMPRzDcK1dKkvIFXO1yRe8DB7CekZz2GemnLBS0x+NIQ8HpYVTyY1hHXWoVtoAkDJ6VyED6hHQf6T/K9OdxZCqExHUcto76g4gEWFSGSjBrU9DUtVVg3Wh2ksmmVM2ZcapM1nPTZ+OmNZnbFHAeHQIHmNJ9WfI2Z/joJ22zivhEkQfGcTWL8/Qtx6hiL4zycpuh97DmJKr0Q+K4Y1ta23ISU1WmmdlAnCax7UxG0Vt0kxLQ4xX9L5jIxGkOCWPEkoCdUDXckMc8h+VsDC3nLsBl4Ssu2kOEtFIJEJfyFuW0sO7vMCukhhlX1I3UELVEVQgS1BRB4aWVedaOPA1njHNgCxhlt6vtMDv+NOU6s38dIZjedlgTWv1zFAbbm8nCSCF+XrTFfp8pjdgN/zVj7lqSvKRy0WoGQsPTdw/TtDqD7CrnQWqSEh+6HRhhZkJkodpbQomHgOWMoXQvsbBQx0Biq0wPRADsUWZdOnZqUWSBgghwlWY/A8zoJsKfTIbu5CdE4h94Zc89ityvv937l+kMMQSKD8KJNQowUM6KvnvpNzsp2Cvfduo/e+y3aI0QFN5RMZ4I+6a9tSKqC2YAZl2tI7Mhbfrb1xCMsuipinExAyqsLSu4/zooT8VkD2HdnKAOuhHB2z4WmcqoIcXCpRUQovCZVC1Hw3WREgWfBLFqfePYwuV/MeFSVJ8SfpVdLfmWuVRBNZCkyoPvD6uOqk2e36oTZTLFusmNmwiejRZDQXrAQFbs/Vza6P6ZV7UID4B3Uj0KlM6tjIRYIQY9ZEPvFuWiMDJPaX7p1P0bzEmkGuOpYZACgA8x47/oF5wZ7vypXDCILDi5Ls2MSanEGaQQtOKrUxkaVmU/DC3kEksCb0XF6Ecez6ElIq6/mcxvjwjLrUyY85YBMRuihEngMeRZ4GACwTYtdrI9M3sFlQee5hkevJAXzfBIMqLSdYOio80FfT56h1NtslT2YLsO7u4cqJRPUHB0KbgM+RtX6+BzIYRs7aSaoabNWBC1Shcv8A5PFasdSta1t3YD14Wxm72oGO4DFag7X0RU8Mt3njWvX5/k6BLsckAvBanVzVZLJt9UlT9w601W/ReoWVW+638+72RS4SWWrXm1i4F2JshcjdD8sbKRbdmllXbfx0GpRsduvePrmP/VlaNQ/FY5nz4uD086fx6ODJAWUfelb7OV0Y/Qskb238PCs69P7+t62frR/9suKDR5OsJfxeWzQwrQnVlhPaw/Nwv0HKOajF0L+RZ+WhSAdNXzRXVwaQG/c21SDb3Sl2EZ7krVF6+hLXfqQZwL+TXsac8LWDtqnvDLdlFmtaKadZlUTkWWhbEqvvHbkTB9+MnbfNEKw0ze7pn8ZYPUMZXU0ZgtXWQ3UnUr1Kx6aAPQnB9qGUh6NIju9RgB7L9QD/eK2QLyOeObFvpSr4DLXJ40K8HTItRSg7aBFrM5MJ1LZaU5udTE3kFjZpCMxMFwGiaHaRN01+0RMBDAYBbyYHFDzQ4nQUsupqm9bdKfaCaIMUgE5MeDV2qel9a3Bwtk/Wae3PEzn1zfDojiK0gEDfy9aTfEBkKDL3GW6emx41tWc4si1Qt93oE+hQE4nXGcxguxC2CAOGke4VyOLSiso5Lzslp0SbVk5Tbfz77LZTnxrSkufyVcf4rZ
s+Hf8wsCbd7i8U9ROLqnqYQAaISb1jHqXl0qFga1VMYJLJRkvlKpPpdzRvwIbKZ16MjpahPAF2K+VqM/6MA3larJaGKH2UOZ+FzM7Kw6RIt6VKV9BocIe7A9tLetd89aBrG5Xvt3L8WmO4bWNm7u5OFOxE5HD1pAcH7Ojw8HBQc7FDv2W77zgqxTY0W6u14XsrclwkDubuc9p/0Yq8YYyuLn2nUwNZ4rB3+CoAtw0E49NsJYJaC6ZLrsXui80kd166A6tAMfCQrsKwDw1GLDoFWiaraP26O0uk/Qt80XUbvb8N4M+AffH0d/rju7VX1HX11lJeV/D7ihco5Huq757tnDqd2154d+vo7fs5bdBCAymIUgtQBf4IxKNwO4kJhXLZwdVUagvM0HC9Wk+4j56KYhqnsgAYNOwMrPrRXkxfMr9atGGyfTrsewAoisQqDo06q4BSyS7bsFqBijuK+T4VtKWskS1yYNetW/aCo8NgGrrKmJdlbFrzdv463tQlYrBmvx7oNQYB/d0eJ1kbF27CyddpMJK9yVIvq6+Va7x+6+2v9O/LTer/Oz/RU8tYoxRXRyUqa5blI5dHYGrtBUQ+vjy79MIins38Ib2yQpOaZkBqbtbhpjzoLNASqTnq+kzHcs1smuX4In7eTlQsZYDuAkgd/cGLiG3VEPVBPYAvM/sudzeHd+w/jvseHN2pzR9KT8za0B2lEAWLGIROS/lawtTqECvEVcFoz2WL7+7OR+ulPME/NOpNg3OHbftr9VFbEyg59LNBYUcyv01sqG0bsYl3i7pZMc2hsYamqVvzdFasGRUt7DmJLHP2LfLnZLKN2BLKXBotlo0bpPinlSBNE168JEWy4uNQ7z123s9YW3bg0Y5c2f0lGZaZY5m7lM+8PjF6YiilWAR277nNeY7J/5788fLKpeSsaRtdYNu2fMXs+ulkfPpTEy51nqarUNNJMvb2JIzWgy/jt6RiX52H9Sdh6FbVIZmqUE55wX/JmpN/WZpsArVpTQJsS9nMCvvwzjHz4k651gZSe+BmI6fr6xSrdZZcsCgT8uiYzIzI/zWPt/s8xrz2tJYR0GI3+xPv8HVwLuQ5tXqCOL2HSmrFITPEUxlTAQBVr4bRNgbsl6pQe3V07gVf7NkYel3oNyBkLZ/xnpgOQaKZ8KKtEA9San/LYtoAQ/lfk1jgcG5nLfvjCHV52qdu8aZE+bQ77ThS2y1GafrrFI6NU3upS7MdGp4sM3wZ/ZRuTP8kV3X+4cuHc+2vnvJWX1Ewpunr3dRXOqmtdSIs2Mu9zqvKwzSV1WEzw7bUp+khQ3Dze9Du9SZcI+FXwToNw+TbRDWdc6xBLX9pTNOvVvT9NdCMuvlGwEzL9GtgGQcGjZrj9yLZ7IB5r/Fz0OrzBhgT3VehuKQzsHn5jSL5S3MutwG02fhUytnq2cL8O/ZXxcRfMXxzECgVOYd6lmXTX6CwxdPhbE3XLqBjyaC2oy1cuh2gso6qiNOZGg6ee4sqLuMlbv/HC0iRpiLJ1ir2NxqhQxGtdFBlbDxJvEbP2iDqnSGc5Is6WybfR8rrFwv+IDrZHtOXLuhgR/CsmddnTN5q7xerUXov8m8ozVZnX4xctk4r9anv79kiW4nIPLZH+WW+qC/ZKLmU4oN5J5d8i5bq0b4RZSEAtBd4oaN8lZ9cxSg1WK49S6+7tDuBS7ObBp2+b3CdNe1b3Sda8gTPqU1EuvLgP7VLqk43Hd/c2W+F7KtA5EzopFvfTYLmhbQem86J1UePWmeTmuYb3ftOr9nANQ45KO5aZ/PEjIdSwheJweXmYetwL3hGGqVfElzn7iWOQd8djd47F3RipsxglF15IAnPrvzu+8n3v21OtuyyNZ3VWOZiFWdVkWxkGUi7ld3beQ5TN6PYGZ00Bi+Q4d0f6KA0RVPtDuRln3Us7z3iRSJ0DNbq+61zxE+tV/d06F+8g5u/HLC7Pf89CXa7ft9IdfA/vwHD6VvdJ0+N3rRfMxhnw56Drd8c+QE/mMR/l4bchHcDCHUX+ZJzIC/cai0VUDjO5QVIdfVAIVq5jM6Grt7m7j+CoaiIJBfLhIfCc29v6dSaCXDsoGTovLXsIjmKQ7pCldMd4Kduciq4SBGaW2PH7Ibns1UAsR7cvTwTiw307mlTBPj95mh4d+d03r9Zpwhd65IVSkaCtQ5QWevnti71PRtQXnT8sjMnvStrnVUwHsNq6ZUUj8BbIdfxTvp2deL1H/+U60q7q3RCs9agcTxzm4ydO2Nu6+yU8oN0ULVaRuBave4L62aCvm2nztrI4W7aO/B35j6OZrZhRBtkWUES6Nkn3s2RjGjqs98z7+hwwI7qrO8d+7GzraW2nNa8UFda5HZTffwMN56M1zv6aoTFa6OE/h2pXkSaV/hqTNoKb/L8np1wkwt7v6pvNRRQ0nQLUqByr6FiBzC3G2d60GLvcPSbYnMPcavcPTtcBoP2RkctHdWOPcJRPbddtk687BHNqnd7JDMvSz4jmLnH0fBmFb21VDql6pOsTre2S9cb+nsk7ElWe6S073M+I6ldB7f57cldSWq07oeBvtJCo8alWNRBC7TyAJxF8lqOSzkm3T5otdJbJqPoI9fzcIdKsqLwykyZ1Y00GesWHGo6GqMu5og5sRngOaRKGAdqu3FRc10zQVf5PXenGMo9feY1I/n99RW6ILV3j4FP/i8AAvzfWwivKHNPqNobH0KkKPH4g+NgsKcMZDKhq4aTCUoymbg4khTK+X+SX8ik \ No newline at end of file diff --git a/third_party/nccl/nccl_configure.bzl b/third_party/nccl/nccl_configure.bzl index 363a65f1f43..eba838cd98e 100644 --- a/third_party/nccl/nccl_configure.bzl +++ b/third_party/nccl/nccl_configure.bzl @@ -75,7 +75,7 @@ def _nccl_configure_impl(repository_ctx): # function to be restarted with all previous state being lost. This # can easily lead to a O(n^2) runtime in the number of labels. 
# See https://github.com/tensorflow/tensorflow/commit/62bd3534525a036f07d9851b3199d68212904778 - find_cuda_config_path = repository_ctx.path(Label("@org_tensorflow//third_party/gpus:find_cuda_config.py")) + find_cuda_config_path = repository_ctx.path(Label("@org_tensorflow//third_party/gpus:find_cuda_config.py.gz.base64")) nccl_version = get_host_environ(repository_ctx, _TF_NCCL_VERSION, "") if nccl_version: diff --git a/third_party/tensorrt/tensorrt_configure.bzl b/third_party/tensorrt/tensorrt_configure.bzl index b3375dc224f..3466ed3b3bb 100644 --- a/third_party/tensorrt/tensorrt_configure.bzl +++ b/third_party/tensorrt/tensorrt_configure.bzl @@ -114,7 +114,7 @@ def _tensorrt_configure_impl(repository_ctx): # function to be restarted with all previous state being lost. This # can easily lead to a O(n^2) runtime in the number of labels. # See https://github.com/tensorflow/tensorflow/commit/62bd3534525a036f07d9851b3199d68212904778 - find_cuda_config_path = repository_ctx.path(Label("@org_tensorflow//third_party/gpus:find_cuda_config.py")) + find_cuda_config_path = repository_ctx.path(Label("@org_tensorflow//third_party/gpus:find_cuda_config.py.gz.base64")) tpl_paths = { "build_defs.bzl": _tpl_path(repository_ctx, "build_defs.bzl"), "BUILD": _tpl_path(repository_ctx, "BUILD"), From e76b3152129f6800065fc7b2816b438b13b2b9aa Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Tue, 18 Feb 2020 11:51:08 -0800 Subject: [PATCH 149/442] Expose ops.executing_eagerly_outside_functions() as new public API. Docstring is also updated. PiperOrigin-RevId: 295787462 Change-Id: Id123ebd73d901c007952d6883e8abb55253272a1 --- tensorflow/python/framework/ops.py | 25 ++++++++++++++++++- .../tools/api/golden/v1/tensorflow.pbtxt | 4 +++ tensorflow/tools/compatibility/renames_v2.py | 2 ++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 5ed9d59fc74..f716dfa33dd 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -5548,8 +5548,31 @@ def init_scope(): outer_graph._device_function_stack = outer_device_stack # pylint: disable=protected-access +@tf_export(v1=["executing_eagerly_outside_functions"]) def executing_eagerly_outside_functions(): - """Returns True if executing eagerly, even if inside a graph function.""" + """Returns True if executing eagerly, even if inside a graph function. + + This function will check the outermost context for the program and see if + it is in eager mode. It is useful comparing to `tf.executing_eagerly()`, + which checks the current context and will return `False` within a + `tf.function` body. It can be used to build library that behave differently + in eager runtime and v1 session runtime (deprecated). + + Example: + + >>> tf.compat.v1.enable_eager_execution() + >>> @tf.function + ... def func(): + ... # A function constructs TensorFlow graphs, it does not execute eagerly, + ... # but the outer most context is still eager. + ... assert not tf.executing_eagerly() + ... return tf.compat.v1.executing_eagerly_outside_functions() + >>> func() + + + Returns: + boolean, whether the outermost context is in eager mode. 
+ """ if context.executing_eagerly(): return True else: diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt index bcefb835e00..2f7c4e8bbd3 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt @@ -1240,6 +1240,10 @@ tf_module { name: "executing_eagerly" argspec: "args=[], varargs=None, keywords=None, defaults=None" } + member_method { + name: "executing_eagerly_outside_functions" + argspec: "args=[], varargs=None, keywords=None, defaults=None" + } member_method { name: "exp" argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " diff --git a/tensorflow/tools/compatibility/renames_v2.py b/tensorflow/tools/compatibility/renames_v2.py index 299c0a4a013..1a0afb6c804 100644 --- a/tensorflow/tools/compatibility/renames_v2.py +++ b/tensorflow/tools/compatibility/renames_v2.py @@ -366,6 +366,8 @@ renames = { 'tf.compat.v1.estimator.tpu.TPUEstimatorSpec', 'tf.estimator.tpu.experimental.EmbeddingConfigSpec': 'tf.compat.v1.estimator.tpu.experimental.EmbeddingConfigSpec', + 'tf.executing_eagerly_outside_functions': + 'tf.compat.v1.executing_eagerly_outside_functions', 'tf.experimental.output_all_intermediates': 'tf.compat.v1.experimental.output_all_intermediates', 'tf.expm1': From 6a202bc94b845ca4bb3f67884f3683ee2492e825 Mon Sep 17 00:00:00 2001 From: Edward Loper Date: Tue, 18 Feb 2020 11:58:36 -0800 Subject: [PATCH 150/442] Update RaggedTensor.__getitem__ to (1) allow indexing into all uniform dimensions, and (2) preserve uniform dimensions. In particular: (1) When slicing a ragged dimension where uniform_row_length is defined, preserve uniform_row_length. (2) Allow indexing into a ragged dimension where uniform_row_length is defined. PiperOrigin-RevId: 295789259 Change-Id: I4bfacf02b8941aa9e96ca944bcc997b7669810c6 --- .../python/ops/ragged/ragged_getitem.py | 84 ++++++++++++--- .../python/ops/ragged/ragged_tensor_test.py | 100 ++++++++++++++++-- 2 files changed, 163 insertions(+), 21 deletions(-) diff --git a/tensorflow/python/ops/ragged/ragged_getitem.py b/tensorflow/python/ops/ragged/ragged_getitem.py index eca3cc3cdfa..ba4b13387b4 100644 --- a/tensorflow/python/ops/ragged/ragged_getitem.py +++ b/tensorflow/python/ops/ragged/ragged_getitem.py @@ -19,9 +19,12 @@ from __future__ import division from __future__ import print_function from tensorflow.python.eager import context +from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops +from tensorflow.python.ops import check_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops.ragged import ragged_gather_ops @@ -41,9 +44,6 @@ def ragged_tensor_getitem(self, key): principles of Python ("In the face of ambiguity, refuse the temptation to guess"), we simply disallow this operation. - Any dimensions added by `array_ops.newaxis` will be ragged if the following - dimension is ragged. - Args: self: The RaggedTensor to slice. key: Indicates which piece of the RaggedTensor to return, using standard @@ -134,15 +134,26 @@ def _ragged_getitem(rt_input, key_list): # that puts all values in a single row. 
if row_key is array_ops.newaxis: inner_rt = _ragged_getitem(rt_input, inner_keys) - nsplits = array_ops.shape(inner_rt.row_splits, - out_type=inner_rt.row_splits.dtype)[0] - return ragged_tensor.RaggedTensor.from_row_splits( - inner_rt, array_ops.stack([0, nsplits - 1]), validate=False) + nsplits = tensor_shape.dimension_at_index(inner_rt.row_splits.shape, 0) + if nsplits.value is not None: + nsplits = nsplits.value + else: + nsplits = array_ops.shape(inner_rt.row_splits, + out_type=inner_rt.row_splits.dtype)[0] + return ragged_tensor.RaggedTensor.from_uniform_row_length( + inner_rt, nsplits - 1, nrows=1, validate=False) # Slicing a range of rows: first slice the outer dimension, and then # call `_ragged_getitem_inner_dimensions` to handle the inner keys. if isinstance(row_key, slice): sliced_rt_input = _slice_ragged_row_dimension(rt_input, row_key) + if rt_input.uniform_row_length is not None: + # If the inner dimension has uniform_row_length, then preserve it (by + # re-wrapping the values in a new RaggedTensor). Note that the row + # length won't have changed, since we're slicing a range of rows (and not + # slicing the rows themselves). + sliced_rt_input = ragged_tensor.RaggedTensor.from_uniform_row_length( + sliced_rt_input.values, rt_input.uniform_row_length) return _ragged_getitem_inner_dimensions(sliced_rt_input, inner_keys) # Indexing a single row: slice values to get the indicated row, and then @@ -245,11 +256,14 @@ def _ragged_getitem_inner_dimensions(rt_input, key_list): # RaggedTensor that puts each value in its own row. if column_key is array_ops.newaxis: inner_rt = _ragged_getitem_inner_dimensions(rt_input, key_list[1:]) - nsplits = array_ops.shape(inner_rt.row_splits, - out_type=inner_rt.row_splits.dtype)[0] - return ragged_tensor.RaggedTensor.from_row_splits(inner_rt, - math_ops.range(nsplits), - validate=False) + nsplits = tensor_shape.dimension_at_index(inner_rt.row_splits.shape, 0) + if nsplits.value is not None: + nsplits = nsplits.value + else: + nsplits = array_ops.shape(inner_rt.row_splits, + out_type=inner_rt.row_splits.dtype)[0] + return ragged_tensor.RaggedTensor.from_uniform_row_length( + inner_rt, 1, nrows=nsplits - 1, validate=False) # Slicing a range of columns in a ragged inner dimension. We use a # recursive call to process the values, and then assemble a RaggedTensor @@ -292,15 +306,59 @@ def _ragged_getitem_inner_dimensions(rt_input, key_list): lambda: math_ops.maximum(limits + stop_offset, lower_bound)) inner_rt = _build_ragged_tensor_from_value_ranges( inner_rt_starts, inner_rt_limits, column_key.step, rt_input.values) + # If the row dimension is uniform, then calculate the new + # uniform_row_length, and rebuild inner_rt using that uniform_row_lengths. + if rt_input.uniform_row_length is not None: + new_row_length = _slice_length(rt_input.uniform_row_length, column_key) + inner_rt = ragged_tensor.RaggedTensor.from_uniform_row_length( + inner_rt.values, new_row_length, rt_input.nrows()) return inner_rt.with_values( _ragged_getitem_inner_dimensions(inner_rt.values, key_list[1:])) # Indexing a single column in a ragged inner dimension: raise an Exception. # See RaggedTensor.__getitem__.__doc__ for an explanation of why indexing # into a ragged inner dimension is problematic. 
- else: + if rt_input.uniform_row_length is None: raise ValueError("Cannot index into an inner ragged dimension.") + # Indexing a single column in a uniform inner dimension: check that the + # given index is in-bounds, and then use a strided slice over rt_input.values + # to take the indicated element from each row. + row_length = rt_input.uniform_row_length + column_key = math_ops.cast(column_key, row_length.dtype) + oob_err_msg = "Index out of bounds when indexing into a ragged tensor" + oob_checks = [ + check_ops.assert_greater_equal( + column_key, -row_length, message=oob_err_msg), + check_ops.assert_less(column_key, row_length, message=oob_err_msg), + ] + with ops.control_dependencies(oob_checks): + offset = _if_ge_zero(column_key, lambda: column_key, + lambda: row_length + column_key) + sliced_rt = rt_input.values[offset::row_length] + return _ragged_getitem_inner_dimensions(sliced_rt, key_list[1:]) + + +def _slice_length(value_length, slice_key): + """Computes the number of elements in a slice of a value with a given length. + + Returns the equivalent of: `len(range(value_length)[slice_key])` + + Args: + value_length: Scalar int `Tensor`: the length of the value being sliced. + slice_key: A `slice` object used to slice elements from the the value. + + Returns: + The number of elements in the sliced value. + """ + # Note: we could compute the slice length without creating a zeros tensor + # with some variant of (stop-start)//step, but doing so would require more + # ops (for checking bounds, handling negative indices, negative step sizes, + # etc); and we expect this to be an uncommon operation, so we use this + # simpler implementation. + zeros = array_ops.zeros(value_length, dtype=dtypes.bool) + return array_ops.size(zeros[slice_key], out_type=value_length.dtype) + def _expand_ellipsis(key_list, num_remaining_dims): """Expands the ellipsis at the start of `key_list`. diff --git a/tensorflow/python/ops/ragged/ragged_tensor_test.py b/tensorflow/python/ops/ragged/ragged_tensor_test.py index 6bc066e5d84..f4c75d26699 100644 --- a/tensorflow/python/ops/ragged/ragged_tensor_test.py +++ b/tensorflow/python/ops/ragged/ragged_tensor_test.py @@ -116,6 +116,12 @@ EXAMPLE_RAGGED_TENSOR_4D_VALUES = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16], [17, 18], [19, 20]] +# Example 3D ragged tensor with uniform_row_lengths. +EXAMPLE_RAGGED_TENSOR_3D = [[[1, 2, 3], [4], [5, 6]], [[], [7, 8, 9], []]] +EXAMPLE_RAGGED_TENSOR_3D_ROWLEN = 3 +EXAMPLE_RAGGED_TENSOR_3D_SPLITS = [0, 3, 4, 6, 6, 9, 9] +EXAMPLE_RAGGED_TENSOR_3D_VALUES = [1, 2, 3, 4, 5, 6, 7, 8, 9] + def int32array(values): return np.array(values, dtype=np.int32) @@ -837,7 +843,7 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, # RaggedTensor.__getitem__ #============================================================================= - def _TestGetItem(self, rt, slice_spec, expected): + def _TestGetItem(self, rt, slice_spec, expected, expected_shape=None): """Helper function for testing RaggedTensor.__getitem__. Checks that calling `rt.__getitem__(slice_spec) returns the expected value. @@ -855,6 +861,7 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, slice_spec: The slice spec. expected: The expected value of rt.__getitem__(slice_spec), as a python list; or an exception class. + expected_shape: The expected shape for `rt.__getitem__(slice_spec)`. 
""" tensor_slice_spec1 = _make_tensor_slice_spec(slice_spec, True) tensor_slice_spec2 = _make_tensor_slice_spec(slice_spec, False) @@ -864,13 +871,18 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, self.assertAllEqual(value1, expected, 'slice_spec=%s' % (slice_spec,)) self.assertAllEqual(value2, expected, 'slice_spec=%s' % (slice_spec,)) self.assertAllEqual(value3, expected, 'slice_spec=%s' % (slice_spec,)) + if expected_shape is not None: + value1.shape.assert_is_compatible_with(expected_shape) + value2.shape.assert_is_compatible_with(expected_shape) + value3.shape.assert_is_compatible_with(expected_shape) def _TestGetItemException(self, rt, slice_spec, expected, message): """Helper function for testing RaggedTensor.__getitem__ exceptions.""" - tensor_slice_spec1 = _make_tensor_slice_spec(slice_spec, True) - self.assertRaisesRegexp(expected, message, rt.__getitem__, slice_spec) - self.assertRaisesRegexp(expected, message, rt.__getitem__, - tensor_slice_spec1) + tensor_slice_spec = _make_tensor_slice_spec(slice_spec, True) + with self.assertRaisesRegexp(expected, message): + self.evaluate(rt.__getitem__(slice_spec)) + with self.assertRaisesRegexp(expected, message): + self.evaluate(rt.__getitem__(tensor_slice_spec)) @parameterized.parameters( # Tests for rt[i] @@ -1225,12 +1237,84 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, self.assertEqual(rt_newaxis3.ragged_rank, 2) self.assertEqual(rt_newaxis4.ragged_rank, 2) - self.assertEqual(rt_newaxis0.shape.as_list(), [1, None, None, None, 2]) - self.assertEqual(rt_newaxis1.shape.as_list(), [2, None, None, None, 2]) - self.assertEqual(rt_newaxis2.shape.as_list(), [2, None, None, None, 2]) + self.assertEqual(rt_newaxis0.shape.as_list(), [1, 2, None, None, 2]) + self.assertEqual(rt_newaxis1.shape.as_list(), [2, 1, None, None, 2]) + self.assertEqual(rt_newaxis2.shape.as_list(), [2, None, 1, None, 2]) self.assertEqual(rt_newaxis3.shape.as_list(), [2, None, None, 1, 2]) self.assertEqual(rt_newaxis4.shape.as_list(), [2, None, None, 2, 1]) + @parameterized.parameters( + # EXAMPLE_RAGGED_TENSOR_3D.shape = [2, 3, None] + + # Indexing into uniform_row_splits dimension: + (SLICE_BUILDER[:, 1], [r[1] for r in EXAMPLE_RAGGED_TENSOR_3D], + [2, None]), + (SLICE_BUILDER[:, 2], [r[2] for r in EXAMPLE_RAGGED_TENSOR_3D], + [2, None]), + (SLICE_BUILDER[:, -2], [r[-2] for r in EXAMPLE_RAGGED_TENSOR_3D], + [2, None]), + (SLICE_BUILDER[:, -3], [r[-3] for r in EXAMPLE_RAGGED_TENSOR_3D], + [2, None]), + (SLICE_BUILDER[1:, 2], [r[2] for r in EXAMPLE_RAGGED_TENSOR_3D[1:]], + [1, None]), + (SLICE_BUILDER[:, 1, 1:], [r[1][1:] for r in EXAMPLE_RAGGED_TENSOR_3D], + [2, None]), + (SLICE_BUILDER[1:, 1, 1:], + [r[1][1:] for r in EXAMPLE_RAGGED_TENSOR_3D[1:]], + [1, None]), + + # Slicing uniform_row_splits dimension: + (SLICE_BUILDER[:, 2:], [r[2:] for r in EXAMPLE_RAGGED_TENSOR_3D], + [2, 1, None]), + (SLICE_BUILDER[:, -2:], [r[-2:] for r in EXAMPLE_RAGGED_TENSOR_3D], + [2, 2, None]), + (SLICE_BUILDER[:, :, 1:], + [[c[1:] for c in r] for r in EXAMPLE_RAGGED_TENSOR_3D], + [2, 3, None]), + (SLICE_BUILDER[:, 5:], [r[5:] for r in EXAMPLE_RAGGED_TENSOR_3D], + [2, 0, None]), + + # Slicing uniform_row_splits dimension with a non-default step size: + (SLICE_BUILDER[:, ::2], [r[::2] for r in EXAMPLE_RAGGED_TENSOR_3D], + [2, 2, None]), + (SLICE_BUILDER[:, ::-1], [r[::-1] for r in EXAMPLE_RAGGED_TENSOR_3D], + [2, 3, None]), + ) + def testRaggedTensorGetItemWithUniformRowLength(self, slice_spec, expected, + expected_shape): + """Test that rt.__getitem__(slice_spec) == 
expected.""" + rt = RaggedTensor.from_uniform_row_length( + RaggedTensor.from_row_splits( + EXAMPLE_RAGGED_TENSOR_3D_VALUES, + EXAMPLE_RAGGED_TENSOR_3D_SPLITS), + EXAMPLE_RAGGED_TENSOR_3D_ROWLEN) + self.assertAllEqual(rt, EXAMPLE_RAGGED_TENSOR_3D) + self.assertIsNot(rt.uniform_row_length, None) + self._TestGetItem(rt, slice_spec, expected, expected_shape) + + # If the result is 3D, then check that it still has a uniform row length: + actual = rt.__getitem__(slice_spec) + if actual.shape.rank == 3: + self.assertIsNot(actual.uniform_row_length, None) + self.assertAllEqual(actual.uniform_row_length, expected_shape[1]) + + @parameterized.parameters( + (SLICE_BUILDER[:, 3], errors.InvalidArgumentError, 'out of bounds'), + (SLICE_BUILDER[:, -4], errors.InvalidArgumentError, 'out of bounds'), + (SLICE_BUILDER[:, 10], errors.InvalidArgumentError, 'out of bounds'), + (SLICE_BUILDER[:, -10], errors.InvalidArgumentError, 'out of bounds'), + ) + def testRaggedTensorGetItemErrorsWithUniformRowLength(self, slice_spec, + expected, message): + """Test that rt.__getitem__(slice_spec) == expected.""" + rt = RaggedTensor.from_uniform_row_length( + RaggedTensor.from_row_splits( + EXAMPLE_RAGGED_TENSOR_3D_VALUES, + EXAMPLE_RAGGED_TENSOR_3D_SPLITS), + EXAMPLE_RAGGED_TENSOR_3D_ROWLEN) + self.assertAllEqual(rt, EXAMPLE_RAGGED_TENSOR_3D) + self._TestGetItemException(rt, slice_spec, expected, message) + #============================================================================= # RaggedTensor.__str__ #============================================================================= From 19ac5f4f6c44ce98654f26c24bb8cd3971c821ab Mon Sep 17 00:00:00 2001 From: Ken Franko Date: Tue, 18 Feb 2020 12:08:45 -0800 Subject: [PATCH 151/442] Make primary property on DistributedValue private. PiperOrigin-RevId: 295791890 Change-Id: I5f2c80392f7a1cb2d2a9131e17d92b29124978bf --- .../python/distribute/cross_device_ops.py | 2 +- .../distribute/mirrored_strategy_test.py | 6 +- .../distribute/parameter_server_strategy.py | 2 +- tensorflow/python/distribute/values.py | 57 ++++++++++--------- tensorflow/python/saved_model/save.py | 2 +- 5 files changed, 35 insertions(+), 34 deletions(-) diff --git a/tensorflow/python/distribute/cross_device_ops.py b/tensorflow/python/distribute/cross_device_ops.py index 4b2814eca3e..9d44f5c554c 100644 --- a/tensorflow/python/distribute/cross_device_ops.py +++ b/tensorflow/python/distribute/cross_device_ops.py @@ -1032,7 +1032,7 @@ class CollectiveAllReduce(CrossDeviceOps): else: # TODO(josh11b): Once we add support for model parallelism, get the # copy from the corresponding replica instead of the primary. 
- index.append(array_ops.identity(all_reduced.primary)) + index.append(array_ops.identity(all_reduced._primary)) # pylint: disable=protected-access return value_lib.regroup(index, wrap_class=value_lib.Mirrored) def batch_reduce_implementation(self, reduce_op, value_destination_pairs): diff --git a/tensorflow/python/distribute/mirrored_strategy_test.py b/tensorflow/python/distribute/mirrored_strategy_test.py index b2ab4bb6ec6..fa7e4a8fcd4 100644 --- a/tensorflow/python/distribute/mirrored_strategy_test.py +++ b/tensorflow/python/distribute/mirrored_strategy_test.py @@ -1334,7 +1334,7 @@ class FunctionTest(test.TestCase): def forward(x, w, b): return x * w + b x = constant_op.constant([1.0], name="x_useless") - concrete_forward = forward.get_concrete_function(x, w.primary, b.primary) + concrete_forward = forward.get_concrete_function(x, w._primary, b._primary) with ms.scope(): def replica_fn(): @@ -1350,8 +1350,8 @@ class FunctionTest(test.TestCase): g1, g2 = step_fn() run_metadata = context.export_run_metadata() context.disable_run_metadata() - self.assertEqual(self.evaluate(g1.primary), 1.0) - self.assertEqual(self.evaluate(g2.primary), 1.0) + self.assertEqual(self.evaluate(g1._primary), 1.0) + self.assertEqual(self.evaluate(g2._primary), 1.0) # Verify that this node runs on both devices. node_name = "gradients_mul_grad_mul_1_x" diff --git a/tensorflow/python/distribute/parameter_server_strategy.py b/tensorflow/python/distribute/parameter_server_strategy.py index 41ea9e3fcb9..a807d4ae9ff 100644 --- a/tensorflow/python/distribute/parameter_server_strategy.py +++ b/tensorflow/python/distribute/parameter_server_strategy.py @@ -487,7 +487,7 @@ class ParameterServerStrategyExtended(distribute_lib.StrategyExtendedV1): def _select_fn(x): # pylint: disable=g-missing-docstring if isinstance(x, values.Mirrored): if len(x.devices) == 1: - return x.primary + return x._primary # pylint: disable=protected-access else: raise ValueError( "You cannot update variable with a Mirrored object with multiple " diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py index 570c3c35cbf..fb3e2ffd817 100644 --- a/tensorflow/python/distribute/values.py +++ b/tensorflow/python/distribute/values.py @@ -75,7 +75,7 @@ class DistributedValues(object): "replica accesses.") def _get_closest(self): - """Returns value in same replica or device if possible, else the primary.""" + """Returns value in same replica or device if possible, else the _primary.""" replica_id = _get_current_replica_id_as_int() if replica_id is None: # Try to find a value on the current device. @@ -83,12 +83,12 @@ class DistributedValues(object): for value in self._values: if device_util.canonicalize(value.device) == current_device: return value - return self.primary + return self._primary else: return self._values[replica_id] @property - def primary(self): + def _primary(self): """Returns a representative component.""" return self._values[0] @@ -368,7 +368,7 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable): def __init__(self, strategy, values): self._distribute_strategy = strategy super(DistributedVariable, self).__init__(values) - self._common_name = self.primary.name.split(":")[0] + self._common_name = self._primary.name.split(":")[0] # Use a weakref to make it easy to map from the contained values # to the container without introducing a reference cycle. 
for v in values: @@ -395,7 +395,7 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable): The op that evaluates to True or False depending on if all the component variables are initialized. """ - result = self.primary.is_initialized() + result = self._primary.is_initialized() # We iterate through the list of values except the last one to allow us to # name the final `logical_and` op the same name that is passed by the user # to the `is_initialized` op. For distributed variables, the @@ -426,11 +426,11 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable): @property def constraint(self): - return self.primary.constraint + return self._primary.constraint @property def graph(self): - return self.primary.graph + return self._primary.graph @property def _shared_name(self): @@ -438,28 +438,28 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable): @property def _unique_id(self): - return self.primary._unique_id # pylint: disable=protected-access + return self._primary._unique_id # pylint: disable=protected-access @property def _graph_key(self): """Lets Optimizers know which graph this variable is from.""" - return self.primary._graph_key # pylint: disable=protected-access + return self._primary._graph_key # pylint: disable=protected-access @property def name(self): - return self.primary.name + return self._primary.name @property def dtype(self): - return self.primary.dtype + return self._primary.dtype @property def shape(self): - return self.primary.shape + return self._primary.shape @property def synchronization(self): - return self.primary.synchronization + return self._primary.synchronization @property def handle(self): @@ -475,10 +475,10 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable): @property def _save_slice_info(self): - return self.primary._save_slice_info # pylint: disable=protected-access + return self._primary._save_slice_info # pylint: disable=protected-access def _get_save_slice_info(self): - return self.primary._get_save_slice_info() # pylint: disable=protected-access + return self._primary._get_save_slice_info() # pylint: disable=protected-access def _set_save_slice_info(self, save_slice_info): for v in self._values: @@ -490,17 +490,17 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable): @property def trainable(self): - return self.primary.trainable + return self._primary.trainable @property def distribute_strategy(self): return self._distribute_strategy def get_shape(self): - return self.primary.get_shape() + return self._primary.get_shape() def to_proto(self, export_scope=None): - return self.primary.to_proto(export_scope=export_scope) + return self._primary.to_proto(export_scope=export_scope) @property def op(self): @@ -508,13 +508,13 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable): # to work (even if the current device isn't in self.devices), but # other uses of var.op in a cross-replica context to fail. 
if distribution_strategy_context.in_cross_replica_context(): - return DistributedVarOp(self.primary.op.name, self.primary.op.graph, - self.primary.op.traceback, self.primary.op.type) + return DistributedVarOp(self._primary.op.name, self._primary.op.graph, + self._primary.op.traceback, self._primary.op.type) return self._get().op @property def _in_graph_mode(self): - return self.primary._in_graph_mode # pylint: disable=protected-access + return self._primary._in_graph_mode # pylint: disable=protected-access def read_value(self): with _enter_or_assert_strategy(self._distribute_strategy): @@ -567,7 +567,7 @@ class TPUVariableMixin(object): # Handle ID is needed for `get_replicated_var_handle` to cache the variables # correctly since in eager mode different variables can have the same name. if ops.executing_eagerly_outside_functions(): - self._handle_id = self._common_name + "_" + str(id(self.primary)) + self._handle_id = self._common_name + "_" + str(id(self._primary)) else: self._handle_id = self._common_name @@ -592,7 +592,7 @@ class TPUVariableMixin(object): if _enclosing_tpu_context() is None: return super(TPUVariableMixin, self)._get_closest() else: - return self.primary + return self._primary def numpy(self): if context.executing_eagerly(): @@ -644,8 +644,8 @@ class TPUVariableMixin(object): @property def op(self): - return DistributedVarOp(self.primary.op.name, self.primary.op.graph, - self.primary.op.traceback, self.primary.op.type) + return DistributedVarOp(self._primary.op.name, self._primary.op.graph, + self._primary.op.traceback, self._primary.op.type) def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False): """Converts a variable to a tensor.""" @@ -900,7 +900,7 @@ class MirroredVariable(DistributedVariable, Mirrored): """ def _saveable_factory(name=self._common_name): - return _MirroredSaveable(self, self.primary, name) + return _MirroredSaveable(self, self._primary, name) return {trackable.VARIABLE_VALUE_KEY: _saveable_factory} @@ -1003,7 +1003,8 @@ class _SyncOnReadSaveable(saver.BaseSaverBuilder.SaveableObject): slice_spec="", name=name, dtype=sync_on_read_variable.dtype, - device=sync_on_read_variable.primary.device) + device=sync_on_read_variable._primary.device) # pylint: disable=protected-access + super(_SyncOnReadSaveable, self).__init__(tensor, [spec], name) def restore(self, restored_tensors, restored_shapes): @@ -1103,7 +1104,7 @@ class SyncOnReadVariable(DistributedVariable): def _get_cross_replica(self): if self._aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA: - return self.primary + return self._primary with _enter_or_assert_strategy(self._distribute_strategy): return self._distribute_strategy.reduce( diff --git a/tensorflow/python/saved_model/save.py b/tensorflow/python/saved_model/save.py index 617f5e83a01..ced4135526a 100644 --- a/tensorflow/python/saved_model/save.py +++ b/tensorflow/python/saved_model/save.py @@ -274,7 +274,7 @@ class _SaveableView(object): self.captured_tensor_node_ids[obj.resource_handle] = node_id elif (ds_values.is_distributed_variable(obj) or resource_variable_ops.is_resource_variable(obj)): - obj_to_copy = obj.primary if ds_values.is_distributed_variable( + obj_to_copy = obj._primary if ds_values.is_distributed_variable( # pylint: disable=protected-access obj) else obj new_variable = resource_variable_ops.copy_to_graph_uninitialized( obj_to_copy) From caad1b7a45c593e83adbc2df0f099e783aff48e8 Mon Sep 17 00:00:00 2001 From: Prakalp Srivastava Date: Tue, 18 Feb 2020 12:10:36 -0800 Subject: [PATCH 152/442] Add import 
support for HLO Scatter op. PiperOrigin-RevId: 295792321 Change-Id: I6daf2b0b49d551a446d6e37b9e6f96fbd11fdbfa --- .../mlir/xla/hlo_function_importer.cc | 32 +++++++++++++++++++ .../compiler/mlir/xla/hlo_function_importer.h | 4 +++ .../mlir/xla/tests/translate/import.hlotxt | 31 ++++++++++++++++++ 3 files changed, 67 insertions(+) diff --git a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc index 6081f2e1461..bc9bdf49a39 100644 --- a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc +++ b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc @@ -370,6 +370,22 @@ StatusOr HloFunctionImporter::ImportInstruction( Convert(interior_padding)) .getOperation(); } + case HloOpcode::kScatter: { + auto scatter = static_cast(instruction); + attributes.push_back( + ConvertScatterDimensionNumbers(scatter->scatter_dimension_numbers())); + attributes.push_back(builder_->getNamedAttr( + "indices_are_sorted", + builder_->getBoolAttr(scatter->indices_are_sorted()))); + attributes.push_back(builder_->getNamedAttr( + "unique_indices", builder_->getBoolAttr(scatter->unique_indices()))); + + auto scatter_op = func_builder->create( + loc, result_type, operands, attributes); + TF_RETURN_IF_ERROR(ImportComputation(scatter->to_apply(), + &scatter_op.update_computation())); + return scatter_op.getOperation(); + } case HloOpcode::kSetDimensionSize: { attributes.push_back(builder_->getNamedAttr( "dimension", builder_->getIntegerAttr(builder_->getIntegerType(32), @@ -844,6 +860,22 @@ mlir::NamedAttribute HloFunctionImporter::ConvertGatherDimensionNumbers( return builder_->getNamedAttr("dimension_numbers", attr); } +mlir::NamedAttribute HloFunctionImporter::ConvertScatterDimensionNumbers( + const xla::ScatterDimensionNumbers& dnums) { + std::vector update_window_dims(dnums.update_window_dims().begin(), + dnums.update_window_dims().end()); + std::vector inserted_window_dims( + dnums.inserted_window_dims().begin(), dnums.inserted_window_dims().end()); + std::vector scatter_dims_to_operand_dims( + dnums.scatter_dims_to_operand_dims().begin(), + dnums.scatter_dims_to_operand_dims().end()); + auto attr = mlir::xla_hlo::ScatterDimensionNumbers::get( + Convert(update_window_dims), Convert(inserted_window_dims), + Convert(scatter_dims_to_operand_dims), + builder_->getI64IntegerAttr(dnums.index_vector_dim()), context_); + return builder_->getNamedAttr("scatter_dimension_numbers", attr); +} + mlir::NamedAttribute HloFunctionImporter::ConvertSourceTargetPairs( const std::vector>& source_target_pairs) { diff --git a/tensorflow/compiler/mlir/xla/hlo_function_importer.h b/tensorflow/compiler/mlir/xla/hlo_function_importer.h index d373e88e1c0..93c8e6e818c 100644 --- a/tensorflow/compiler/mlir/xla/hlo_function_importer.h +++ b/tensorflow/compiler/mlir/xla/hlo_function_importer.h @@ -121,6 +121,10 @@ class HloFunctionImporter { mlir::NamedAttribute ConvertGatherDimensionNumbers( const xla::GatherDimensionNumbers& dnums); + // Converts the scatter dimensions to attributes. + mlir::NamedAttribute ConvertScatterDimensionNumbers( + const xla::ScatterDimensionNumbers& dnums); + // Converts XLA instruction source target pairs to MLIR attribute. 
mlir::NamedAttribute ConvertSourceTargetPairs( const std::vector>& diff --git a/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt index a02db66cd47..b2dec8c950f 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt @@ -716,6 +716,37 @@ ENTRY %dummy_main (Arg_0.1: f32[]) -> f32[] { ROOT %Arg_0.1 = f32[] parameter(0) } +// Test scatter +%update_computation { + %lhs = f32[] parameter(0) + %rhs = f32[] parameter(1) + ROOT %sum = f32[] add(f32[] %lhs, f32[] %rhs) +} + +%test_scatter { + %input_tensor = f32[200,100,300] parameter(0) + %scatter_indices = s64[10,2] parameter(1) + %updates = f32[10,300] parameter(2) + ROOT %scatter = f32[200,100,300] scatter(f32[200,100,300] %input_tensor, s64[10,2] %scatter_indices, f32[10,300] %updates), update_window_dims={1}, inserted_window_dims={0,1}, scatter_dims_to_operand_dims={0,1}, index_vector_dim=1, to_apply=%update_computation +} + +// CHECK-LABEL: func @test_scatter +// CHECK-SAME: [[ARG_0:%.*]]: tensor<200x100x300xf32>, [[ARG_1:%.*]]: tensor<10x2xi64>, [[ARG_2:%.*]]: tensor<10x300xf32>) -> tensor<200x100x300xf32> +// CHECK: "xla_hlo.scatter"([[ARG_0]], [[ARG_1]], [[ARG_2]]) ( { +// CHECK: ^bb0([[LHS:%.*]]: tensor, [[RHS:%.*]]: tensor): +// CHECK: [[ADD:%.*]] = xla_hlo.add [[LHS]], [[RHS]] +// CHECK: "xla_hlo.return"([[ADD]]) : (tensor) -> () +// CHECK: }) +// CHECK-SAME: indices_are_sorted = false +// CHECK-SAME: scatter_dimension_numbers = { +// CHECK-SAME: index_vector_dim = 1 : i64 +// CHECK-SAME: inserted_window_dims = dense<[0, 1]> : tensor<2xi64> +// CHECK-SAME: scatter_dims_to_operand_dims = dense<[0, 1]> : tensor<2xi64> +// CHECK-SAME: update_window_dims = dense<1> : tensor<1xi64> +// CHECK-SAME: } +// CHECK-SAME: unique_indices = false + + // CHECK-LABEL: func @test_select(%arg0: tensor<2x3xi1>, %arg1: tensor<2x3xi32>, %arg2: tensor<2x3xi32>) -> tensor<2x3xi32> { %test_select { %Arg_0.1 = pred[2,3] parameter(0) From 11b27dd35a8f9da69f9b5f67bc4431fd04a92334 Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Tue, 18 Feb 2020 12:16:05 -0800 Subject: [PATCH 153/442] [MLIR:TF/XLA] Resource lifting for PartitionedCallOp/StatefulPartitionedCallOp If a called function involves resources, clone it then lift the resource ops outside. Multiple call sites will share the same lifted callee function. PiperOrigin-RevId: 295793372 Change-Id: I39b00dab43815216a5fa5b2d594f3d391f871290 --- .../tensorflow/tests/resource_op_lifting.mlir | 113 +++++++++ .../transforms/resource_op_lifting.cc | 215 +++++++++++++++++- 2 files changed, 319 insertions(+), 9 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir b/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir index 016b06b662a..52bc0f878fc 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir @@ -542,3 +542,116 @@ func @if_else(%arg0: tensor<*x!tf.resource>>, %arg1: tensor<*x!tf. -> (tensor<*x!tf.resource>>) { return %arg1 : tensor<*x!tf.resource>> } + +// ----- + +// Tests that the pass lifts resources on two partitioned call ops sharing the +// same callee. The lifting should clone the callee then modify the clone. 
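+// Concretely, both call sites below are rewritten to target the cloned
+// @callee_resource_lifted, which receives the variable's value as a plain
+// tensor argument; the ReadVariableOp is hoisted outside the tf_device.launch.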
+ +// CHECK-LABEL: @launch_with_partitioned_call +func @launch_with_partitioned_call() -> tensor { + // CHECK: %[[VH:.*]] = "tf.VarHandleOp"() + %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> + // CHECK: %[[CONST:.*]] = "tf.Const"() + %1 = "tf.Const"() {value = dense<10.0> : tensor} : () -> tensor + // CHECK: %[[READ:.*]] = "tf.ReadVariableOp"(%[[VH]]) + // CHECK: %[[LAUNCH:.*]] = "tf_device.launch"() + %2 = "tf_device.launch"() ( { + // CHECK: %[[PC0:.*]] = "tf.PartitionedCall"(%[[CONST]], %[[READ]], %[[CONST]]) + // CHECK-SAME: f = @callee_resource_lifted + %3 = "tf.PartitionedCall"(%1, %0, %1) {f = @callee, config = "", config_proto = "", executor_type = ""} + : (tensor, tensor<*x!tf.resource>>, tensor) -> tensor + // CHECK: %[[PC1:.*]] = "tf.PartitionedCall"(%[[CONST]], %[[READ]], %[[CONST]]) + // CHECK-SAME: f = @callee_resource_lifted + %4 = "tf.PartitionedCall"(%1, %0, %1) {f = @callee, config = "", config_proto = "", executor_type = ""} + : (tensor, tensor<*x!tf.resource>>, tensor) -> tensor + // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[PC0]], %[[PC1]]) + %5 = "tf.AddV2"(%3, %4) : (tensor, tensor) -> tensor + // CHECK: tf_device.return %[[ADD]] : tensor + tf_device.return %5 : tensor + }) {device = "tpu0", launch_attr = "launch_attr"} : () -> tensor + return %2 : tensor +} +// CHECK: @callee(%[[OA0:.*]]: tensor, %[[OA1:.*]]: tensor<*x!tf.resource>>, %[[OA2:.*]]: tensor) -> tensor +func @callee(%arg0: tensor, %arg1: tensor<*x!tf.resource>>, %arg2: tensor) -> tensor { + // CHECK: "tf.ReadVariableOp"(%[[OA1]]) + %0 = "tf.ReadVariableOp"(%arg1) : (tensor<*x!tf.resource>>) -> tensor + %1 = "tf.AddV2"(%0, %arg0) : (tensor, tensor) -> tensor + %2 = "tf.AddV2"(%1, %arg2) : (tensor, tensor) -> tensor + return %2 : tensor +} +// CHECK: func @callee_resource_lifted(%[[A0:.*]]: tensor, %[[A1:.*]]: tensor, %[[A2:.*]]: tensor) -> tensor +// CHECK-NEXT: %[[ADD0:.*]] = "tf.AddV2"(%[[A1]], %[[A0]]) +// CHECK-NEXT: %[[ADD1:.*]] = "tf.AddV2"(%[[ADD0]], %[[A2]]) +// CHECK-NEXT: return %[[ADD1]] + + +// ----- + +// Tests that the pass lifts resources on two stateful partitioned call ops +// sharing the same callee. The lifting should clone the callee then modify the +// clone. 
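+// In this variant the callee also writes to the first variable, so the lifted
+// launch returns the updated value and a tf.AssignVariableOp is emitted after
+// the launch to write it back.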
+ +// CHECK-LABEL: @launch_with_stateful_partitioned_call +func @launch_with_stateful_partitioned_call() -> () { + // CHECK: %[[VH0:.*]] = "tf.VarHandleOp"() + %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> + // CHECK: %[[VH1:.*]] = "tf.VarHandleOp"() + %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource>> + // CHECK: %[[CONST:.*]] = "tf.Const"() + %2 = "tf.Const"() {value = dense<10.0> : tensor} : () -> tensor + // CHECK-DAG: %[[READ0:.*]] = "tf.ReadVariableOp"(%[[VH0]]) + // CHECK-DAG: %[[READ1:.*]] = "tf.ReadVariableOp"(%[[VH1]]) + // CHECK: %[[LAUNCH:.*]] = "tf_device.launch"() + "tf_device.launch"() ( { + // CHECK: %[[PC0:.*]] = "tf.StatefulPartitionedCall"(%[[READ0]], %[[READ1]], %[[CONST]]) + // CHECK-SAME: f = @callee_resource_lifted + %3 = "tf.StatefulPartitionedCall"(%0, %1, %2) {f = @callee, config = "", config_proto = "", executor_type = ""} + : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor) -> tensor<*x!tf.resource>> + // CHECK: %[[PC1:.*]] = "tf.StatefulPartitionedCall"(%[[PC0]], %[[READ1]], %[[CONST]]) + // CHECK-SAME: f = @callee_resource_lifted + %4 = "tf.StatefulPartitionedCall"(%3, %1, %2) {f = @callee, config = "", config_proto = "", executor_type = ""} + : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor) -> tensor<*x!tf.resource>> + // CHECK: tf_device.return %[[PC1]] : tensor + tf_device.return + // CHECK: {device = "tpu0", launch_attr = "launch_attr"} : () -> tensor + }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () + // CHECK: "tf.AssignVariableOp"(%[[VH0]], %[[LAUNCH]]) + return +} +// CHECK: @callee(%[[OA0:.*]]: tensor<*x!tf.resource>>, %[[OA1:.*]]: tensor<*x!tf.resource>>, %[[OA2:.*]]: tensor) -> tensor<*x!tf.resource>> +func @callee(%arg0: tensor<*x!tf.resource>>, %arg1: tensor<*x!tf.resource>>, %arg2: tensor) -> tensor<*x!tf.resource>> { + // CHECK: "tf.ReadVariableOp"(%[[OA1]]) + %0 = "tf.ReadVariableOp"(%arg1) : (tensor<*x!tf.resource>>) -> tensor + %1 = "tf.AddV2"(%0, %arg2) : (tensor, tensor) -> tensor + "tf.AssignVariableOp"(%arg0, %1) {dtype = i32} : (tensor<*x!tf.resource>>, tensor) -> () + return %arg0 : tensor<*x!tf.resource>> +} +// CHECK: func @callee_resource_lifted(%[[A0:.*]]: tensor, %[[A1:.*]]: tensor, %[[A2:.*]]: tensor) -> tensor +// CHECK-NEXT: %[[ADD:.*]] = "tf.AddV2"(%[[A1]], %[[A2]]) +// CHECK-NEXT: return %[[ADD]] + + +// ----- + +// Tests that the pass reports error on called function that has resource output +// which doesn't alias an input. 
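+// The callee below returns a resource created inside its own body rather than
+// one of its resource arguments, which the lifting pass cannot express.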
+ +func @launch_with_stateful_partitioned_call() -> () { + %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> + %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource>> + %2 = "tf.Const"() {value = dense<10.0> : tensor} : () -> tensor + "tf_device.launch"() ( { + %3 = "tf.StatefulPartitionedCall"(%0, %1, %2) {f = @callee, config = "", config_proto = "", executor_type = ""} + : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor) -> tensor<*x!tf.resource>> + %4 = "tf.StatefulPartitionedCall"(%3, %1, %2) {f = @callee, config = "", config_proto = "", executor_type = ""} + : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor) -> tensor<*x!tf.resource>> + tf_device.return + }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () + return +} +// expected-error @+1 {{Unsupported function call: resource return value does not alias an input.}} +func @callee(%arg0: tensor<*x!tf.resource>>, %arg1: tensor<*x!tf.resource>>, %arg2: tensor) -> tensor<*x!tf.resource>> { + %0 = "tf._Unknown_"() : () -> tensor<*x!tf.resource>> + return %0 : tensor<*x!tf.resource>> +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc index 7f0b1b96560..8dc21feca90 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc @@ -31,6 +31,7 @@ limitations under the License. #include "mlir/IR/Function.h" // TF:llvm-project #include "mlir/IR/Module.h" // TF:llvm-project #include "mlir/IR/StandardTypes.h" // TF:llvm-project +#include "mlir/IR/SymbolTable.h" // TF:llvm-project #include "mlir/IR/TypeUtilities.h" // TF:llvm-project #include "mlir/IR/Types.h" // TF:llvm-project #include "mlir/IR/Value.h" // TF:llvm-project @@ -811,16 +812,185 @@ LogicalResult HanldeIfOP(TF::IfOp if_op, FuncOp then_branch, return success(); } +// A resource-lifted function for (potentially multiple) PartitionedCallOps and +// information about the lifting changes. +struct PartitionedCallLiftingInfo { + // Function with resources lifted. Can be nullptr if nothing needs to change. + FuncOp lifted_callee; + // Mapping from old resource outputs to their aliasing output inputs. + llvm::SmallDenseMap old_outputs_aliasing_old_inputs; + // Mapping from old to new output indices in case any output is removed. + llvm::SmallVector old_to_new_output_indices; + // ResourceArgUseInfo for each old resource argument. + llvm::SmallDenseMap use_info; + // Input for AddLoadsStoresOutsideControlFlowOp(), see its comment. + llvm::SmallDenseMap> + arg_data_type_and_updated_output_index; +}; + +// Lifts loads/stores from a PartitionedCallOp's callee function. If anything +// needs to be changed, the original function will be preserved, and the lifting +// happens on a clone, which will be stored in `result`. +LogicalResult HandlePartitionedCallOpCallee( + FuncOp callee, PartitionedCallLiftingInfo* result) { + // Remove identity nodes to avoid aliasing. + RemoveIdentity(&callee.front()); + // Sanity check: return of resources should be aliases of inputs. Such outputs + // will be removed later. 
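+  // That is, every resource-typed return value must be one of the callee's own
+  // block arguments; a resource produced inside the callee has no aliasing
+  // input and is rejected with an error below.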
+ int64_t non_resource_results = 0; + for (auto entry : + llvm::enumerate(callee.front().getTerminator()->getOperands())) { + auto retval = entry.value(); + if (!getElementTypeOrSelf(retval.getType()).isa()) { + result->old_to_new_output_indices.push_back(non_resource_results++); + continue; + } + auto aliasing_arg = retval.dyn_cast(); + if (!aliasing_arg) { + return callee.emitOpError( + "Unsupported function call: resource return value does not alias an " + "input."); + } + result->old_outputs_aliasing_old_inputs[entry.index()] = + aliasing_arg.getArgNumber(); + result->old_to_new_output_indices.push_back(-1); + } + + if (failed(FindResourceArgUseInfo(callee, &result->use_info))) { + return failure(); + } + if (result->use_info.empty()) { + result->lifted_callee = nullptr; + return success(); + } + + // Clone the callee before making changes. + SmallString<64> name_base = callee.getName(); + auto module = callee.getParentOfType(); + name_base += "_resource_lifted"; + auto name = name_base; + { + int64_t counter = 0; + while (module.lookupSymbol(name)) { + auto name = name_base; + name += "_" + std::to_string(counter++); + } + } + callee = callee.clone(); + callee.setName(name); + SymbolTable(module).insert(callee); + result->lifted_callee = callee; + + // Remove unused resources in functions. + llvm::SmallDenseMap remaining_resource_data_types; + RemoveUnusedResourceArgumentsAndForwardedRetvals( + result->use_info, callee, /*old_to_new_arg_indices=*/nullptr, + &remaining_resource_data_types); + for (const auto& entry : remaining_resource_data_types) { + result->arg_data_type_and_updated_output_index[entry.getFirst()] = { + entry.getSecond(), -1}; + } + llvm::SmallVector new_retvals; + for (auto val : callee.front().getTerminator()->getOperands()) { + // Remove resource type outputs. + if (getElementTypeOrSelf(val.getType()).isa()) continue; + new_retvals.push_back(val); + } + // Lift resources. + LiftArgRetResourcesForFunction( + callee, remaining_resource_data_types, [&](int64_t index, Value value) { + result->arg_data_type_and_updated_output_index[index].second = + new_retvals.size(); + new_retvals.push_back(value); + }); + auto old_return = callee.front().getTerminator(); + // Replace old return with the new ones with update values. + OpBuilder builder(old_return); + auto new_return = builder.create(old_return->getLoc(), new_retvals); + old_return->erase(); + callee.setType(FunctionType::get( + callee.getType().getInputs(), + llvm::to_vector<4>(new_return.getOperandTypes()), callee.getContext())); + return success(); +} + +// Updates a PartitionedCallOp/StatefulPartitionedCallOp according to the +// resource-lifted new callee function in lifting_info. +template +void UpdatePartitionedCallOpWithNewCallee( + CallOpType call_op, const PartitionedCallLiftingInfo& lifting_info) { + if (lifting_info.lifted_callee == nullptr) return; + // Replace output resource uses with the aliasing input, so that we can remove + // this output. + for (const auto& entry : lifting_info.old_outputs_aliasing_old_inputs) { + call_op.getResult(entry.getFirst()) + .replaceAllUsesWith(call_op.getOperand(entry.getSecond())); + } + // Recreate the call op. + OpBuilder builder(call_op); + // Now use the filtered original operands, which will be replaced by + // AddLoadsStoresOutsideControlFlowOp(). 
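+  // (The remaining resource operands are subsequently rewritten into explicit
+  // loads before the call and stores after it, using the mapping recorded in
+  // arg_data_type_and_updated_output_index.)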
+ auto new_operands = + FilterRange(call_op.args(), lifting_info.use_info); + auto new_call = builder.create( + call_op.getLoc(), + const_cast(lifting_info.lifted_callee).getType().getResults(), + new_operands, call_op.getAttrs()); + new_call.setAttr( + "f", builder.getSymbolRefAttr( + const_cast(lifting_info.lifted_callee).getName())); + AddLoadsStoresOutsideControlFlowOp( + new_call, lifting_info.arg_data_type_and_updated_output_index); + // Replace uses. + for (int64_t i = 0; i < lifting_info.old_to_new_output_indices.size(); ++i) { + if (lifting_info.old_to_new_output_indices[i] >= 0) { + call_op.getResult(i).replaceAllUsesWith( + new_call.getResult(lifting_info.old_to_new_output_indices[i])); + } + } + call_op.erase(); +} + +LogicalResult HoistForFunctionalControlFlow( + Block*, ModuleOp, llvm::SmallDenseMap*); + +// A templated routine for handling both PartitionedCallOp and +// StatefulPartitionedCallOp. If the callee is already lifted, it just updates +// the caller op itself; otherwise, it first recursively handles nested control +// flow, then performs lifting on the callee. +template +LogicalResult HandlePartitionedCallOp( + CallOpType call_op, FuncOp callee, ModuleOp module, + llvm::SmallDenseMap* lifted_callees) { + auto emplace_res = + lifted_callees->try_emplace(callee, PartitionedCallLiftingInfo()); + if (emplace_res.second) { + // Unseen callee. Perform resource lifting on it. + HoistForFunctionalControlFlow(&callee.front(), module, lifted_callees); + if (failed(HandlePartitionedCallOpCallee( + callee, &emplace_res.first->getSecond()))) { + return failure(); + } + } + UpdatePartitionedCallOpWithNewCallee(call_op, emplace_res.first->getSecond()); + return success(); +} + // Hoists resource loads/stores from control flow ops in `block` outside the -// body/cond/branch functions. -LogicalResult HoistForFunctionalControlFlow(Block* block, ModuleOp module) { +// body/cond/branch/callee functions. +LogicalResult HoistForFunctionalControlFlow( + Block* block, ModuleOp module, + llvm::SmallDenseMap* + lifted_partitioned_call_callees) { for (Operation& op : llvm::make_early_inc_range(*block)) { if (auto while_op = llvm::dyn_cast(&op)) { auto body = llvm::cast(module.lookupSymbol(while_op.body())); auto cond = llvm::cast(module.lookupSymbol(while_op.cond())); // Recursively handle the nested control flow. - HoistForFunctionalControlFlow(&body.front(), module); - HoistForFunctionalControlFlow(&cond.front(), module); + HoistForFunctionalControlFlow(&body.front(), module, + lifted_partitioned_call_callees); + HoistForFunctionalControlFlow(&cond.front(), module, + lifted_partitioned_call_callees); if (failed(HanldeWhileLoop(while_op, body, cond))) return failure(); } else if (auto if_op = llvm::dyn_cast(&op)) { auto then_branch = @@ -828,9 +998,30 @@ LogicalResult HoistForFunctionalControlFlow(Block* block, ModuleOp module) { auto else_branch = llvm::cast(module.lookupSymbol(if_op.else_branch())); // Recursively handle the nested control flow. 
- HoistForFunctionalControlFlow(&then_branch.front(), module); - HoistForFunctionalControlFlow(&else_branch.front(), module); + HoistForFunctionalControlFlow(&then_branch.front(), module, + lifted_partitioned_call_callees); + HoistForFunctionalControlFlow(&else_branch.front(), module, + lifted_partitioned_call_callees); if (failed(HanldeIfOP(if_op, then_branch, else_branch))) return failure(); + } else if (auto call_op = llvm::dyn_cast(&op)) { + if (!call_op.f().isa()) { + return call_op.emitError( + "Resource lifting does not support call with nested references."); + } + auto callee = llvm::cast( + module.lookupSymbol(call_op.f().getRootReference())); + if (failed(HandlePartitionedCallOp(call_op, callee, module, + lifted_partitioned_call_callees))) { + // Nested control flow handling is done in HandlePartitionedCallOp(). + return failure(); + } + } else if (auto call_op = + llvm::dyn_cast(&op)) { + auto callee = llvm::cast(module.lookupSymbol(call_op.f())); + if (failed(HandlePartitionedCallOp(call_op, callee, module, + lifted_partitioned_call_callees))) { + return failure(); + } } } return success(); @@ -840,10 +1031,13 @@ LogicalResult HoistForFunctionalControlFlow(Block* block, ModuleOp module) { // outside. Returns failure if there are remaining resource-type values that can // not be lifted. void ResourceOpLiftingPass::runOnModule() { + llvm::SmallDenseMap + lifted_partitioned_call_callees; auto result = getModule().walk([&](FuncOp func_op) { return func_op.walk([&](tf_device::LaunchOp launch_op) { - if (failed(HoistForFunctionalControlFlow(&launch_op.GetBody(), - getModule())) || + if (failed(HoistForFunctionalControlFlow( + &launch_op.GetBody(), getModule(), + &lifted_partitioned_call_callees)) || failed(HoistResourceOpsFromLaunchOp(launch_op))) { return WalkResult::interrupt(); } @@ -901,8 +1095,11 @@ LogicalResult ResourceLiftingForFunctionalControlFlow(FuncOp function) { << function.getBlocks().size(); } + llvm::SmallDenseMap + lifted_partitioned_call_callees; return HoistForFunctionalControlFlow(&function.front(), - cast(function.getParentOp())); + cast(function.getParentOp()), + &lifted_partitioned_call_callees); } } // namespace TF From 0151f021aece5f85ef41f826372c164f5ac1c998 Mon Sep 17 00:00:00 2001 From: Tres Popp Date: Tue, 18 Feb 2020 12:17:40 -0800 Subject: [PATCH 154/442] [TF:XLA] Avoid lowering vector data formats in Maxpool through XLA. XLA doesn't handle these formats now, so leave them to Tensorflow to run optimized kernels on. PiperOrigin-RevId: 295793708 Change-Id: I299abebb7abd05d72b0c9d2eeea0bef20f382ce2 --- tensorflow/compiler/tf2xla/kernels/pooling_ops.cc | 9 +++++++++ tensorflow/python/kernel_tests/pooling_ops_test.py | 4 ++++ 2 files changed, 13 insertions(+) diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc index 67d49eafcde..5f5cae8f176 100644 --- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc @@ -32,6 +32,8 @@ limitations under the License. 
#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/kernels/pooling_ops_common.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/util/tensor_format.h" namespace tensorflow { namespace { @@ -157,6 +159,13 @@ class MaxPoolOp : public PoolingOp { OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str)); OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_), errors::InvalidArgument("Invalid data format")); + OP_REQUIRES( + ctx, + data_format_ != FORMAT_NCHW_VECT_C && + data_format_ != FORMAT_NHWC_VECT_W, + errors::Unimplemented("XLA does not support the VECT_* data formats. " + "Returning unimplemented from MaxPool to keep " + "Tensorflow's intended optimized MaxPool here.")); } void Compile(XlaOpKernelContext* ctx) override { diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py index 2e47c50acef..c9b1e42d66b 100644 --- a/tensorflow/python/kernel_tests/pooling_ops_test.py +++ b/tensorflow/python/kernel_tests/pooling_ops_test.py @@ -605,6 +605,10 @@ class PoolingTest(test.TestCase): use_gpu=use_gpu) @test_util.run_deprecated_v1 + @test_util.xla_allow_fallback( + "Allow VECT_* data formats on newer hardware versions which XLA does not" + " handle." + ) def testMaxPooling(self): for use_gpu in True, False: self._testMaxPoolValidPadding(use_gpu) From 1b8ecff1856c80905fc395cebabc6d0641fce017 Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Tue, 18 Feb 2020 12:21:21 -0800 Subject: [PATCH 155/442] Return a meaningful error message when control flow encounters `None` values. PiperOrigin-RevId: 295794428 Change-Id: Ib60a31604bbca700898cf5efa25f4cf52de69440 --- .../autograph/operators/control_flow.py | 41 ++++--- .../autograph/operators/control_flow_test.py | 113 ++++++++++++++++++ 2 files changed, 137 insertions(+), 17 deletions(-) diff --git a/tensorflow/python/autograph/operators/control_flow.py b/tensorflow/python/autograph/operators/control_flow.py index 15cf53de8aa..5b2380827b1 100644 --- a/tensorflow/python/autograph/operators/control_flow.py +++ b/tensorflow/python/autograph/operators/control_flow.py @@ -104,22 +104,19 @@ INEFFICIENT_UNROLL_MIN_OPS = 1 # datasets. Before it can be used though, we need to standardize the interface. -# TODO(mdan): Use existing symbol names rather than carrying them separately. -def _disallow_undefs_into_loop(*values): +def _verify_loop_init_vars(values, symbol_names): """Ensures that all values in the state are defined when entering a loop.""" - undefined = tuple(filter(special_values.is_undefined, values)) - if undefined: - raise ValueError( - '{} must be defined before the loop.'.format( - ','.join(s.symbol_name for s in undefined))) - - for value in values: + for name, value in zip(symbol_names, values): + if value is None: + raise ValueError('"{}" may not be None before the loop.'.format(name)) if special_values.is_undefined_return(value): # Assumption: the loop will only capture the variable which tracks the # return value if the loop contained a return statement. # TODO(mdan): This should be checked at the place where return occurs. 
raise ValueError( 'return statements are not supported within a TensorFlow loop.') + if special_values.is_undefined(value): + raise ValueError('"{}" must be defined before the loop.'.format(name)) def _is_subshape(left, right): @@ -142,11 +139,15 @@ def _is_subshape(left, right): def _verify_single_loop_var( name, check_shape, init, entry, exit_, shape_invariant): """Verifies whether the initial, entry and exit values are consistent.""" + assert entry is not None, 'no TF op should set "{}" to None?'.format(name) + if exit_ is None: + raise ValueError('"{}" is None at the end of the iteration.'.format(name)) + if isinstance(init, (bool, int, float, str, np.ndarray)): init = ops.convert_to_tensor_v2(init) if isinstance(entry, (bool, int, float, str, np.ndarray)): entry = ops.convert_to_tensor_v2(entry) - if isinstance(exit_, (bool, int, float, str)): + if isinstance(exit_, (bool, int, float, str, np.ndarray)): exit_ = ops.convert_to_tensor_v2(exit_) if (not tensor_util.is_tensor(entry) or @@ -237,10 +238,16 @@ def _verify_tf_loop_vars(init_vars, def _verify_single_cond_var(name, body_var, orelse_var): """Verifies whether body_var and orelse_var are consistent.""" - if isinstance(body_var, (bool, int, float, str)): + if body_var is None: + raise ValueError('"{}" is None at the end of the TRUE branch.'.format(name)) + if orelse_var is None: + raise ValueError( + '"{}" is None at the end of the FALSE branch.'.format(name)) + + if isinstance(body_var, (bool, int, float, str, np.ndarray)): body_var = ops.convert_to_tensor_v2(body_var) - if isinstance(orelse_var, (bool, int, float, str)): + if isinstance(orelse_var, (bool, int, float, str, np.ndarray)): orelse_var = ops.convert_to_tensor_v2(orelse_var) if (not tensor_util.is_tensor(body_var) or @@ -443,7 +450,7 @@ def _tf_ragged_for_stmt( iter_, extra_test, body, get_state, set_state, symbol_names, opts): """Overload of for_stmt that iterates over TF ragged tensors.""" init_vars = get_state() - _disallow_undefs_into_loop(*init_vars) + _verify_loop_init_vars(init_vars, symbol_names) # TODO(mdan): Move this into len()? Requires eager support. if iter_.shape and iter_.shape[0] is not None: @@ -540,7 +547,7 @@ def _tf_iterator_for_stmt( set_state(loop_vars) init_vars = aug_get_state() - _disallow_undefs_into_loop(*init_vars) + _verify_loop_init_vars(init_vars, symbol_names) def aug_body(): """Main body passed to _tf_while_stmt.""" @@ -612,7 +619,7 @@ def _tf_dataset_for_stmt( # reduce(take_while(scan(3))) init_vars = get_state() - _disallow_undefs_into_loop(*init_vars) + _verify_loop_init_vars(init_vars, symbol_names) # Workaround for Dataset.reduce not allowing empty state tensors - create # a dummy state variable that remains unused. @@ -680,7 +687,7 @@ def _tf_distributed_iterable_for_stmt( 'for ... 
in distributed input loops.') init_vars = get_state() - _disallow_undefs_into_loop(init_vars) + _verify_loop_init_vars(init_vars, symbol_names) if 'shape_invariants' in opts: opts['shape_invariants'] = _shape_invariants_mapping_to_positional_list( @@ -852,7 +859,7 @@ def _shape_invariants_mapping_to_positional_list(mapping, keys): def _tf_while_stmt(test, body, get_state, set_state, symbol_names, opts): """Overload of while_stmt that stages a TF while_stmt.""" init_vars = get_state() - _disallow_undefs_into_loop(*init_vars) + _verify_loop_init_vars(init_vars, symbol_names) def aug_test(*loop_vars): set_state(loop_vars) diff --git a/tensorflow/python/autograph/operators/control_flow_test.py b/tensorflow/python/autograph/operators/control_flow_test.py index bbcffa07a06..222f6d7ed97 100644 --- a/tensorflow/python/autograph/operators/control_flow_test.py +++ b/tensorflow/python/autograph/operators/control_flow_test.py @@ -25,14 +25,17 @@ from __future__ import print_function import re import sys +import numpy as np import six from tensorflow.python.autograph.operators import control_flow +from tensorflow.python.autograph.operators import special_values from tensorflow.python.autograph.utils import ag_logging from tensorflow.python.data.ops import dataset_ops from tensorflow.python.eager import def_function from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import func_graph from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import test_util @@ -519,6 +522,44 @@ class ForLoopTest(test.TestCase): # Note: 123 = ((0*10 + 1)*10+2)*10+3 (first element of each row). self.assertEqual(self.evaluate(v.read_value()), 123) + def _basic_loop(self, init_value, body_fn): + def body(i): + nonlocal s + s = body_fn(i, s) + + def set_state(loop_vars): + nonlocal s + s, = loop_vars + + s = init_value + control_flow.for_stmt( + constant_op.constant([1, 2, 3, 4]), + extra_test=lambda: True, + body=body, + get_state=lambda: (s,), + set_state=set_state, + symbol_names=('s',), + opts={}) + return s + + def test_tensor_illegal_input(self): + with self.assertRaisesRegex(ValueError, '"s" may not be None'): + self._basic_loop(None, lambda i, s: s) + with self.assertRaisesRegex(ValueError, '"s" must be defined'): + self._basic_loop(special_values.Undefined(''), lambda i, s: s) + + def test_tensor_none_output(self): + with self.assertRaisesRegex(ValueError, '"s" is None at the end'): + self._basic_loop(0, lambda i, s: None) + + def test_tensor_dtype_change(self): + with self.assertRaisesRegex(TypeError, '"s".* dtype float32 after'): + self._basic_loop(0, lambda i, s: 1.0) + + def test_tensor_shape_change(self): + with self.assertRaisesRegex(ValueError, r'"s".* shape \(1,\) after'): + self._basic_loop(0, lambda i, s: np.array([1], dtype=np.int32)) + @test_util.run_all_in_graph_and_eager_modes class WhileLoopTest(test.TestCase): @@ -718,6 +759,46 @@ class WhileLoopTest(test.TestCase): self.assertTrue(re.match( r'.* Large unrolled loop.*Add.*', out_capturer.getvalue())) + def _basic_loop(self, init_value, body_fn): + def body(): + nonlocal i, s + s = body_fn(i, s) + i += 1 + + def set_state(loop_vars): + nonlocal i, s + i, s = loop_vars + + i = 0 + n = constant_op.constant(5) + s = init_value + control_flow.while_stmt( + test=lambda: i < n, + body=body, + get_state=lambda: (i, s), + set_state=set_state, + symbol_names=('i', 's'), + opts={}) + return s + + def 
test_tensor_illegal_input(self): + with self.assertRaisesRegex(ValueError, '"s" may not be None'): + self._basic_loop(None, lambda i, s: s) + with self.assertRaisesRegex(ValueError, '"s" must be defined'): + self._basic_loop(special_values.Undefined(''), lambda i, s: s) + + def test_tensor_none_output(self): + with self.assertRaisesRegex(ValueError, '"s" is None at the end'): + self._basic_loop(0, lambda i, s: None) + + def test_tensor_dtype_change(self): + with self.assertRaisesRegex(TypeError, '"s".* dtype float32 after'): + self._basic_loop(0, lambda i, s: 1.0) + + def test_tensor_shape_change(self): + with self.assertRaisesRegex(ValueError, r'"s".* shape \(1,\) after'): + self._basic_loop(0, lambda i, s: np.array([1], dtype=np.int32)) + @test_util.run_all_in_graph_and_eager_modes class IfStmtTest(test.TestCase): @@ -783,6 +864,38 @@ class IfStmtTest(test.TestCase): self.assertEqual((1, 2), test_fn(True)) self.assertEqual((-1, -2), test_fn(False)) + def _basic_cond(self, true_value, false_value): + # Eager cond had different semantics, we don't test those here. + with func_graph.FuncGraph('tmp').as_default(): + return control_flow.if_stmt( + cond=constant_op.constant(True), + body=true_value, + orelse=false_value, + get_state=lambda: (), + set_state=lambda _: None, + basic_symbol_names=('s',), + composite_symbol_names=()) + + def test_tensor_none_output(self): + with self.assertRaisesRegex( + ValueError, '"s" is None at the end of the TRUE branch'): + self._basic_cond(lambda: None, lambda: 1) + with self.assertRaisesRegex( + ValueError, '"s" is None at the end of the FALSE branch'): + self._basic_cond(lambda: 1, lambda: None) + + def test_tensor_undefined_output(self): + with self.assertRaisesRegex( + ValueError, "must also be initialized in the if.*'s'"): + self._basic_cond(lambda: special_values.Undefined('s'), lambda: 1) + with self.assertRaisesRegex( + ValueError, "must also be initialized in the else.*'s'"): + self._basic_cond(lambda: 1, lambda: special_values.Undefined('s')) + + def test_tensor_dtype_change(self): + with self.assertRaisesRegex(TypeError, '"s" has dtype int32.*but.*float32'): + self._basic_cond(lambda: 1, lambda: 1.0) + if __name__ == '__main__': test.main() From be9eb5f03f36ec612fd5d0abb4c5a3a100b5e581 Mon Sep 17 00:00:00 2001 From: Jakob Buchgraber Date: Tue, 18 Feb 2020 12:34:48 -0800 Subject: [PATCH 156/442] tensorrt_configure: Factor logic to create local repository into its own function This follows the same pattern as other repository rules. In a follow up change I will introduce remote_tensorrt_configure that will use _create_local_tensorrt_repository as its implementation function. PiperOrigin-RevId: 295797220 Change-Id: Idbb56df088caae114ce23a898464577573257feb --- third_party/tensorrt/tensorrt_configure.bzl | 80 ++++++++++++--------- 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/third_party/tensorrt/tensorrt_configure.bzl b/third_party/tensorrt/tensorrt_configure.bzl index 3466ed3b3bb..484a85649d9 100644 --- a/third_party/tensorrt/tensorrt_configure.bzl +++ b/third_party/tensorrt/tensorrt_configure.bzl @@ -71,45 +71,18 @@ def _create_dummy_repository(repository_ctx): "%{tensorrt_version}": "", }) + # Copy license file in non-remote build. 
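+    # Both the dummy and the local repository paths copy this file, so the
+    # generated repository provides a LICENSE whether or not TensorRT support
+    # is enabled.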
+ repository_ctx.template( + "LICENSE", + Label("@org_tensorflow//third_party/tensorrt:LICENSE"), + {}, + ) + def enable_tensorrt(repository_ctx): """Returns whether to build with TensorRT support.""" return int(get_host_environ(repository_ctx, _TF_NEED_TENSORRT, False)) -def _tensorrt_configure_impl(repository_ctx): - """Implementation of the tensorrt_configure repository rule.""" - - if get_host_environ(repository_ctx, _TF_TENSORRT_CONFIG_REPO) != None: - # Forward to the pre-configured remote repository. - remote_config_repo = get_host_environ(repository_ctx, _TF_TENSORRT_CONFIG_REPO) - repository_ctx.template("BUILD", Label(remote_config_repo + ":BUILD"), {}) - repository_ctx.template( - "build_defs.bzl", - Label(remote_config_repo + ":build_defs.bzl"), - {}, - ) - repository_ctx.template( - "tensorrt/include/tensorrt_config.h", - Label(remote_config_repo + ":tensorrt/include/tensorrt_config.h"), - {}, - ) - repository_ctx.template( - "LICENSE", - Label(remote_config_repo + ":LICENSE"), - {}, - ) - return - - # Copy license file in non-remote build. - repository_ctx.template( - "LICENSE", - Label("//third_party/tensorrt:LICENSE"), - {}, - ) - - if not enable_tensorrt(repository_ctx): - _create_dummy_repository(repository_ctx) - return - +def _create_local_tensorrt_repository(repository_ctx): # Resolve all labels before doing any real work. Resolving causes the # function to be restarted with all previous state being lost. This # can easily lead to a O(n^2) runtime in the number of labels. @@ -159,6 +132,13 @@ def _tensorrt_configure_impl(repository_ctx): {"%{copy_rules}": "\n".join(copy_rules)}, ) + # Copy license file in non-remote build. + repository_ctx.template( + "LICENSE", + Label("@org_tensorflow//third_party/tensorrt:LICENSE"), + {}, + ) + # Set up tensorrt_config.h, which is used by # tensorflow/stream_executor/dso_loader.cc. repository_ctx.template( @@ -167,6 +147,36 @@ def _tensorrt_configure_impl(repository_ctx): {"%{tensorrt_version}": trt_version}, ) +def _tensorrt_configure_impl(repository_ctx): + """Implementation of the tensorrt_configure repository rule.""" + + if get_host_environ(repository_ctx, _TF_TENSORRT_CONFIG_REPO) != None: + # Forward to the pre-configured remote repository. + remote_config_repo = repository_ctx.os.environ[_TF_TENSORRT_CONFIG_REPO] + repository_ctx.template("BUILD", Label(remote_config_repo + ":BUILD"), {}) + repository_ctx.template( + "build_defs.bzl", + Label(remote_config_repo + ":build_defs.bzl"), + {}, + ) + repository_ctx.template( + "tensorrt/include/tensorrt_config.h", + Label(remote_config_repo + ":tensorrt/include/tensorrt_config.h"), + {}, + ) + repository_ctx.template( + "LICENSE", + Label(remote_config_repo + ":LICENSE"), + {}, + ) + return + + if not enable_tensorrt(repository_ctx): + _create_dummy_repository(repository_ctx) + return + + _create_local_tensorrt_repository(repository_ctx) + tensorrt_configure = repository_rule( implementation = _tensorrt_configure_impl, environ = [ From 0d2f3be5ebe4c762dddad2fe1bac1b4af538de2c Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Tue, 18 Feb 2020 12:36:06 -0800 Subject: [PATCH 157/442] Automated rollback of commit 6d00b470f51a62536b3b56c8facc80d871214df5 PiperOrigin-RevId: 295797482 Change-Id: I5218b6ee1d1e8437791520ff2eddd3bed208d199 --- .../python/keras/layers/preprocessing/BUILD | 15 ++---- .../layers/preprocessing/index_lookup.py | 46 +++---------------- .../layers/preprocessing/index_lookup_test.py | 36 ++------------- .../preprocessing/testdata/repeated_vocab.txt | 5 -- .../layers/preprocessing/testdata/vocab.txt | 4 -- .../tools/pip_package/pip_smoke_test.py | 1 - 6 files changed, 14 insertions(+), 93 deletions(-) delete mode 100644 tensorflow/python/keras/layers/preprocessing/testdata/repeated_vocab.txt delete mode 100644 tensorflow/python/keras/layers/preprocessing/testdata/vocab.txt diff --git a/tensorflow/python/keras/layers/preprocessing/BUILD b/tensorflow/python/keras/layers/preprocessing/BUILD index 64e8509a599..720e92483fb 100644 --- a/tensorflow/python/keras/layers/preprocessing/BUILD +++ b/tensorflow/python/keras/layers/preprocessing/BUILD @@ -11,14 +11,6 @@ package( exports_files(["LICENSE"]) -filegroup( - name = "testdata", - srcs = [ - "testdata/repeated_vocab.txt", - "testdata/vocab.txt", - ], -) - py_library( name = "preprocessing", srcs = [ @@ -284,7 +276,6 @@ tf_py_test( name = "index_lookup_test", size = "medium", srcs = ["index_lookup_test.py"], - data = [":testdata"], python_version = "PY3", deps = [ ":index_lookup", @@ -312,9 +303,10 @@ cuda_py_test( ) tf_py_test( - name = "normalization_test", + name = "preprocessing_normalization_test", size = "small", srcs = ["normalization_test.py"], + main = "normalization_test.py", python_version = "PY3", deps = [ ":normalization", @@ -325,9 +317,10 @@ tf_py_test( ) tf_py_test( - name = "text_vectorization_test", + name = "preprocessing_text_vectorization_test", size = "medium", srcs = ["text_vectorization_test.py"], + main = "text_vectorization_test.py", python_version = "PY3", deps = [ ":preprocessing_test_utils", diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup.py b/tensorflow/python/keras/layers/preprocessing/index_lookup.py index e8c2c0aefc6..7bd7f6683d1 100644 --- a/tensorflow/python/keras/layers/preprocessing/index_lookup.py +++ b/tensorflow/python/keras/layers/preprocessing/index_lookup.py @@ -32,7 +32,6 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import lookup_ops from tensorflow.python.ops.ragged import ragged_functional_ops from tensorflow.python.ops.ragged import ragged_tensor -from tensorflow.python.platform import gfile from tensorflow.python.util import compat # The string tokens in the extracted vocabulary @@ -67,13 +66,7 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): 1. If this value is more than 1, OOV inputs are hashed to determine their OOV value; if this value is 0, passing an OOV input will result in a runtime error. - vocabulary: An optional list of vocabulary terms, or a path to a text file - containing a vocabulary to load into this layer. The file should contain - one token per line. In either case, the vocabulary must be unique; if - the list or file contains the same token multiple times, an error will - be thrown. Note that when passing a vocabulary - either as a list or as - a file - the vocabulary will not be present in the layer's config dict; - it will instead be a part of the layer's weights. + vocabulary: An optional list of vocabulary terms. 
reserve_zero: Whether to reserve the index 0, which indicates pad values in the Keras masking system. If True, the output of this layer will be in the range `[1...max_tokens+1)`; if False, the output will be in the range @@ -171,38 +164,10 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): self._inverse_table = None if vocabulary is not None: - if isinstance(vocabulary, str): - vocabulary = self._get_vocabulary_from_file(vocabulary) - - vocabulary_set = set(vocabulary) - if len(vocabulary) != len(vocabulary_set): - repeated_items = [ - item for item, count in collections.Counter(vocabulary).items() - if count > 1 - ] - raise ValueError("The passed vocabulary has at least one repeated " - "term. Please uniquify your dataset before passing " - "it to IndexLookup(). The repeated terms are %s" % - repeated_items) + self._export_vocab = True self.set_vocabulary(vocabulary) - - def _get_vocabulary_from_file(self, vocabulary_path): - vocab = [] - with gfile.GFile(vocabulary_path, "r") as reader: - while True: - # Get the next line, and break if it is None. - text = reader.readline() - if not text: - break - - # Convert the raw text into UTF8 and strip whitespace. - if isinstance(text, str): - token = text - elif isinstance(text, bytes): - token = text.decode("utf-8", "ignore") - token = token.strip() - vocab.append(token) - return vocab + else: + self._export_vocab = False def _get_table_data(self): keys, values = self._table.export() @@ -291,10 +256,11 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): return [x for _, x in sorted(zip(values, keys))] def get_config(self): + vocabulary = self.get_vocabulary() if self._export_vocab else None config = { "max_tokens": self.max_tokens, "num_oov_tokens": self.num_oov_tokens, - "vocabulary": None, + "vocabulary": vocabulary, "reserve_zero": self.reserve_zero, "mask_zero": self.mask_zero, } diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py b/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py index 508706cbd93..d0493ed3b95 100644 --- a/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py +++ b/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py @@ -37,7 +37,6 @@ from tensorflow.python.keras.layers.preprocessing import index_lookup_v1 from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils from tensorflow.python.keras.utils.generic_utils import CustomObjectScope from tensorflow.python.ops.ragged import ragged_factory_ops -from tensorflow.python.platform import resource_loader from tensorflow.python.platform import test @@ -356,13 +355,7 @@ class IndexLookupOutputTest(keras_parameterized.TestCase, output_dataset = model.predict(input_array) self.assertAllEqual(expected_output, output_dataset) - -@keras_parameterized.run_all_keras_modes -class IndexLookupVocabularyTest(keras_parameterized.TestCase, - preprocessing_test_utils.PreprocessingLayerTest - ): - - def test_int_output_explicit_vocab(self): + def test_int_output_explicit_vocab_from_config(self): vocab_data = ["earth", "wind", "and", "fire"] input_array = np.array([["earth", "wind", "and", "fire"], ["fire", "and", "earth", "michigan"]]) @@ -372,20 +365,10 @@ class IndexLookupVocabularyTest(keras_parameterized.TestCase, layer = get_layer_class()(vocabulary=vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - def 
test_int_output_explicit_vocab_from_file(self): - vocab_data = resource_loader.get_path_to_datafile("testdata/vocab.txt") - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(vocabulary=vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) + with CustomObjectScope({"IndexLookup": get_layer_class()}): + new_model = keras.Model.from_config(model.get_config()) + output_dataset = new_model.predict(input_array) self.assertAllEqual(expected_output, output_dataset) def test_vocab_appending(self): @@ -403,17 +386,6 @@ class IndexLookupVocabularyTest(keras_parameterized.TestCase, output_dataset = model.predict(input_array) self.assertAllClose(expected_output, output_dataset) - def test_non_unique_vocab_fails(self): - vocab_data = ["earth", "wind", "and", "fire", "fire"] - with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"): - _ = get_layer_class()(vocabulary=vocab_data) - - def test_non_unique_vocab_from_file_fails(self): - vocab_data = resource_loader.get_path_to_datafile( - "testdata/repeated_vocab.txt") - with self.assertRaisesRegex(ValueError, ".*repeated term.*earth.*"): - _ = get_layer_class()(vocabulary=vocab_data) - @keras_parameterized.run_all_keras_modes class InverseLookupOutputTest(keras_parameterized.TestCase, diff --git a/tensorflow/python/keras/layers/preprocessing/testdata/repeated_vocab.txt b/tensorflow/python/keras/layers/preprocessing/testdata/repeated_vocab.txt deleted file mode 100644 index 6b3ae610420..00000000000 --- a/tensorflow/python/keras/layers/preprocessing/testdata/repeated_vocab.txt +++ /dev/null @@ -1,5 +0,0 @@ -earth -wind -and -fire -earth diff --git a/tensorflow/python/keras/layers/preprocessing/testdata/vocab.txt b/tensorflow/python/keras/layers/preprocessing/testdata/vocab.txt deleted file mode 100644 index dfe3147a3bd..00000000000 --- a/tensorflow/python/keras/layers/preprocessing/testdata/vocab.txt +++ /dev/null @@ -1,4 +0,0 @@ -earth -wind -and -fire diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py index d89e06a6ac1..7e3643f65b7 100644 --- a/tensorflow/tools/pip_package/pip_smoke_test.py +++ b/tensorflow/tools/pip_package/pip_smoke_test.py @@ -83,7 +83,6 @@ DEPENDENCY_BLACKLIST = [ "//tensorflow/core:lmdb_testdata", "//tensorflow/core/kernels/cloud:bigquery_reader_ops", "//tensorflow/python/debug:grpc_tensorflow_server.par", - "//tensorflow/python/keras/layers/preprocessing:testdata", "//tensorflow/python/feature_column:vocabulary_testdata", "//tensorflow/python:framework/test_file_system.so", "//tensorflow/python:util_nest_test_main_lib", From 60fb12820edb61c496f3fac1ee4dd61338e968b7 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 18 Feb 2020 12:55:23 -0800 Subject: [PATCH 158/442] Go: Update generated wrapper functions for TensorFlow ops. 
PiperOrigin-RevId: 295801134 Change-Id: Icc9cc651d6694048fddd8ed461e6911f651090c5 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 86be1ef98aa..ffa9931d561 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. 
The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21329,7 +21329,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22037,7 +22037,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22233,7 +22233,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22302,7 +22302,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22417,7 +22417,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22476,7 +22476,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22650,7 +22650,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22841,7 +22841,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25281,7 +25281,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25613,7 +25613,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25663,7 +25663,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25913,7 +25913,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26543,7 +26543,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27608,7 +27608,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45467,7 +45467,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 49a83c96b0efca8aab794609b31de17fc7a77813 Mon Sep 17 00:00:00 2001 From: Alexander Belyaev Date: Tue, 18 Feb 2020 13:00:47 -0800 Subject: [PATCH 159/442] [MLIR][XLA] Remove `output_dimensions` arg from LHLO DynamicBroadcastInDimOp. It is not needed since we have access to the output buffer. PiperOrigin-RevId: 295802211 Change-Id: I078c7b91f837e80131a8dde5bb735a8ca72ee876 --- tensorflow/compiler/mlir/xla/ir/hlo_ops.td | 23 +++++++++++++++++-- .../compiler/mlir/xla/ir/hlo_ops_base.td | 23 ------------------- tensorflow/compiler/mlir/xla/ir/lhlo_ops.td | 10 -------- .../compiler/mlir/xla/tests/lhlo_ops.mlir | 7 ------ 4 files changed, 21 insertions(+), 42 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td index 869995fe68f..e2cd42104b3 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td @@ -60,6 +60,13 @@ def HLO_Tuple : NestedTupleOf<[HLO_Tensor, HLO_Token]>; def HLO_TensorOrTuple : AnyTypeOf<[HLO_Tensor, HLO_Tuple]>; +// Dynamic representation of a shape vector as a tensor. Ideally this would be +// an index type (as it stores indices) but that is currently disallowed in +// MLIR. +def HLO_DimensionTensor : ShapedContainerType< + [AnyInteger], And<[IsTensorTypePred, HasAnyRankOfPred<[1]>]>, + "a 1D tensor of dimensions">; + // In general, static shaped tensor constraints should be avoided unless // it is for a legacy op which is only correct with static shapes. def HLO_StaticShapeTensor : StaticShapeTensorOf<[ @@ -771,10 +778,22 @@ def HLO_BroadcastInDimOp : HLO_Op<"broadcast_in_dim", } def HLO_DynamicBroadcastInDimOp : HLO_Op<"dynamic_broadcast_in_dim", - [NoSideEffect]>, BASE_HLO_DynamicBroadcastInDimOp { + [NoSideEffect]> { + string summary = "Broadcast a tensor into the given dynamic shape by adding dimensions."; + string description = [{ + This is a generalization of the BroadcastInDimOp which accepts its output + dimensions as an argument. It should eventually supercede the statically + shaped original, but is being phased as a separate op in order to support + compatibility with lowerings and translations that precede dynamic + shapes. 
+ + Note that the `broadcast_dimensions` attribute is optional and if omitted, + it is assumed to be an ordered, right-aligned mapping from input to + output dimensions. + }]; let arguments = (ins HLO_Tensor:$operand, - HLO_BASE_DimensionTensor:$output_dimensions, + HLO_DimensionTensor:$output_dimensions, BroadcastDimAttr:$broadcast_dimensions ); diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td index cace05a0913..64303e86fe0 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td @@ -27,13 +27,6 @@ def HLO_Pred : TypeAlias; // matching the matrix to dimensions 1 and 2 of the cuboid. def BroadcastDimAttr : OptionalAttr; -// Dynamic representation of a shape vector as a tensor. Ideally this would be -// an index type (as it stores indices) but that is currently disallowed in -// MLIR. -def HLO_BASE_DimensionTensor : ShapedContainerType< - [AnyInteger], And<[IsTensorTypePred, HasAnyRankOfPred<[1]>]>, - "a 1D tensor of dimensions">; - //===----------------------------------------------------------------------===// // XLA nullary op definitions. //===----------------------------------------------------------------------===// @@ -817,22 +810,6 @@ class BASE_HLO_BroadcastInDimOp { }]; } -class BASE_HLO_DynamicBroadcastInDimOp { - string summary = "Broadcast a tensor into the given dynamic shape by adding dimensions."; - - string description = [{ - This is a generalization of the BroadcastInDimOp which accepts its output - dimensions as an argument. It should eventually supercede the statically - shaped original, but is being phased as a separate op in order to support - compatibility with lowerings and translations that precede dynamic - shapes. - - Note that the `broadcast_dimensions` attribute is optional and if omitted, - it is assumed to be an ordered, right-aligned mapping from input to - output dimensions. 
- }]; -} - class BASE_HLO_CholeskyOp { string summary = "Cholesky operator"; diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td index 411c8a89396..794fee181a6 100644 --- a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td @@ -242,16 +242,6 @@ def LHLO_BroadcastInDimOp : LHLO_Op<"broadcast_in_dim", ); } -def HLO_DynamicBroadcastInDimOp : LHLO_Op<"dynamic_broadcast_in_dim", - [NoSideEffect]>, BASE_HLO_DynamicBroadcastInDimOp { - let arguments = (ins - LHLO_Buffer:$operand, - HLO_BASE_DimensionTensor:$output_dimensions, - LHLO_Buffer:$output, - BroadcastDimAttr:$broadcast_dimensions - ); -} - def LHLO_ClampOp : LHLO_Op<"clamp", []>, BASE_HLO_ClampOp { let arguments = (ins LHLO_Buffer:$min, diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir index 00ad25503d7..9f181d574c0 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir @@ -152,13 +152,6 @@ func @broadcast_in_dim_zero_rank_memref(%arg0: memref, %out: memref<1x2x3xi // ----- -// CHECK-LABEL: func @dynamic_broadcast_in_dim_memref -func @dynamic_broadcast_in_dim_memref(%arg0: memref, %out: memref, %shape: tensor<3xi64>) -> () { - "xla_lhlo.dynamic_broadcast_in_dim"(%arg0, %shape, %out) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (memref, tensor<3xi64>, memref) -> () - return -} - -// ----- // CHECK-LABEL: func @reduce_memref func @reduce_memref(%input: memref<10xf32>, %init: memref, %out: memref<1xf32>) -> () { From 36fe0e7aadccfcba4b5dd5ed35c9995dceb6e4b6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 18 Feb 2020 13:01:54 -0800 Subject: [PATCH 160/442] Automated rollback of commit 19ac5f4f6c44ce98654f26c24bb8cd3971c821ab PiperOrigin-RevId: 295802414 Change-Id: I344ec4bb8a0a2cb9921f2f36fa86da9c7f2b55e3 --- .../python/distribute/cross_device_ops.py | 2 +- .../distribute/mirrored_strategy_test.py | 6 +- .../distribute/parameter_server_strategy.py | 2 +- tensorflow/python/distribute/values.py | 57 +++++++++---------- tensorflow/python/saved_model/save.py | 2 +- 5 files changed, 34 insertions(+), 35 deletions(-) diff --git a/tensorflow/python/distribute/cross_device_ops.py b/tensorflow/python/distribute/cross_device_ops.py index 9d44f5c554c..4b2814eca3e 100644 --- a/tensorflow/python/distribute/cross_device_ops.py +++ b/tensorflow/python/distribute/cross_device_ops.py @@ -1032,7 +1032,7 @@ class CollectiveAllReduce(CrossDeviceOps): else: # TODO(josh11b): Once we add support for model parallelism, get the # copy from the corresponding replica instead of the primary. 
- index.append(array_ops.identity(all_reduced._primary)) # pylint: disable=protected-access + index.append(array_ops.identity(all_reduced.primary)) return value_lib.regroup(index, wrap_class=value_lib.Mirrored) def batch_reduce_implementation(self, reduce_op, value_destination_pairs): diff --git a/tensorflow/python/distribute/mirrored_strategy_test.py b/tensorflow/python/distribute/mirrored_strategy_test.py index fa7e4a8fcd4..b2ab4bb6ec6 100644 --- a/tensorflow/python/distribute/mirrored_strategy_test.py +++ b/tensorflow/python/distribute/mirrored_strategy_test.py @@ -1334,7 +1334,7 @@ class FunctionTest(test.TestCase): def forward(x, w, b): return x * w + b x = constant_op.constant([1.0], name="x_useless") - concrete_forward = forward.get_concrete_function(x, w._primary, b._primary) + concrete_forward = forward.get_concrete_function(x, w.primary, b.primary) with ms.scope(): def replica_fn(): @@ -1350,8 +1350,8 @@ class FunctionTest(test.TestCase): g1, g2 = step_fn() run_metadata = context.export_run_metadata() context.disable_run_metadata() - self.assertEqual(self.evaluate(g1._primary), 1.0) - self.assertEqual(self.evaluate(g2._primary), 1.0) + self.assertEqual(self.evaluate(g1.primary), 1.0) + self.assertEqual(self.evaluate(g2.primary), 1.0) # Verify that this node runs on both devices. node_name = "gradients_mul_grad_mul_1_x" diff --git a/tensorflow/python/distribute/parameter_server_strategy.py b/tensorflow/python/distribute/parameter_server_strategy.py index a807d4ae9ff..41ea9e3fcb9 100644 --- a/tensorflow/python/distribute/parameter_server_strategy.py +++ b/tensorflow/python/distribute/parameter_server_strategy.py @@ -487,7 +487,7 @@ class ParameterServerStrategyExtended(distribute_lib.StrategyExtendedV1): def _select_fn(x): # pylint: disable=g-missing-docstring if isinstance(x, values.Mirrored): if len(x.devices) == 1: - return x._primary # pylint: disable=protected-access + return x.primary else: raise ValueError( "You cannot update variable with a Mirrored object with multiple " diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py index fb3e2ffd817..570c3c35cbf 100644 --- a/tensorflow/python/distribute/values.py +++ b/tensorflow/python/distribute/values.py @@ -75,7 +75,7 @@ class DistributedValues(object): "replica accesses.") def _get_closest(self): - """Returns value in same replica or device if possible, else the _primary.""" + """Returns value in same replica or device if possible, else the primary.""" replica_id = _get_current_replica_id_as_int() if replica_id is None: # Try to find a value on the current device. @@ -83,12 +83,12 @@ class DistributedValues(object): for value in self._values: if device_util.canonicalize(value.device) == current_device: return value - return self._primary + return self.primary else: return self._values[replica_id] @property - def _primary(self): + def primary(self): """Returns a representative component.""" return self._values[0] @@ -368,7 +368,7 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable): def __init__(self, strategy, values): self._distribute_strategy = strategy super(DistributedVariable, self).__init__(values) - self._common_name = self._primary.name.split(":")[0] + self._common_name = self.primary.name.split(":")[0] # Use a weakref to make it easy to map from the contained values # to the container without introducing a reference cycle. 
for v in values: @@ -395,7 +395,7 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable): The op that evaluates to True or False depending on if all the component variables are initialized. """ - result = self._primary.is_initialized() + result = self.primary.is_initialized() # We iterate through the list of values except the last one to allow us to # name the final `logical_and` op the same name that is passed by the user # to the `is_initialized` op. For distributed variables, the @@ -426,11 +426,11 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable): @property def constraint(self): - return self._primary.constraint + return self.primary.constraint @property def graph(self): - return self._primary.graph + return self.primary.graph @property def _shared_name(self): @@ -438,28 +438,28 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable): @property def _unique_id(self): - return self._primary._unique_id # pylint: disable=protected-access + return self.primary._unique_id # pylint: disable=protected-access @property def _graph_key(self): """Lets Optimizers know which graph this variable is from.""" - return self._primary._graph_key # pylint: disable=protected-access + return self.primary._graph_key # pylint: disable=protected-access @property def name(self): - return self._primary.name + return self.primary.name @property def dtype(self): - return self._primary.dtype + return self.primary.dtype @property def shape(self): - return self._primary.shape + return self.primary.shape @property def synchronization(self): - return self._primary.synchronization + return self.primary.synchronization @property def handle(self): @@ -475,10 +475,10 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable): @property def _save_slice_info(self): - return self._primary._save_slice_info # pylint: disable=protected-access + return self.primary._save_slice_info # pylint: disable=protected-access def _get_save_slice_info(self): - return self._primary._get_save_slice_info() # pylint: disable=protected-access + return self.primary._get_save_slice_info() # pylint: disable=protected-access def _set_save_slice_info(self, save_slice_info): for v in self._values: @@ -490,17 +490,17 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable): @property def trainable(self): - return self._primary.trainable + return self.primary.trainable @property def distribute_strategy(self): return self._distribute_strategy def get_shape(self): - return self._primary.get_shape() + return self.primary.get_shape() def to_proto(self, export_scope=None): - return self._primary.to_proto(export_scope=export_scope) + return self.primary.to_proto(export_scope=export_scope) @property def op(self): @@ -508,13 +508,13 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable): # to work (even if the current device isn't in self.devices), but # other uses of var.op in a cross-replica context to fail. 
if distribution_strategy_context.in_cross_replica_context(): - return DistributedVarOp(self._primary.op.name, self._primary.op.graph, - self._primary.op.traceback, self._primary.op.type) + return DistributedVarOp(self.primary.op.name, self.primary.op.graph, + self.primary.op.traceback, self.primary.op.type) return self._get().op @property def _in_graph_mode(self): - return self._primary._in_graph_mode # pylint: disable=protected-access + return self.primary._in_graph_mode # pylint: disable=protected-access def read_value(self): with _enter_or_assert_strategy(self._distribute_strategy): @@ -567,7 +567,7 @@ class TPUVariableMixin(object): # Handle ID is needed for `get_replicated_var_handle` to cache the variables # correctly since in eager mode different variables can have the same name. if ops.executing_eagerly_outside_functions(): - self._handle_id = self._common_name + "_" + str(id(self._primary)) + self._handle_id = self._common_name + "_" + str(id(self.primary)) else: self._handle_id = self._common_name @@ -592,7 +592,7 @@ class TPUVariableMixin(object): if _enclosing_tpu_context() is None: return super(TPUVariableMixin, self)._get_closest() else: - return self._primary + return self.primary def numpy(self): if context.executing_eagerly(): @@ -644,8 +644,8 @@ class TPUVariableMixin(object): @property def op(self): - return DistributedVarOp(self._primary.op.name, self._primary.op.graph, - self._primary.op.traceback, self._primary.op.type) + return DistributedVarOp(self.primary.op.name, self.primary.op.graph, + self.primary.op.traceback, self.primary.op.type) def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False): """Converts a variable to a tensor.""" @@ -900,7 +900,7 @@ class MirroredVariable(DistributedVariable, Mirrored): """ def _saveable_factory(name=self._common_name): - return _MirroredSaveable(self, self._primary, name) + return _MirroredSaveable(self, self.primary, name) return {trackable.VARIABLE_VALUE_KEY: _saveable_factory} @@ -1003,8 +1003,7 @@ class _SyncOnReadSaveable(saver.BaseSaverBuilder.SaveableObject): slice_spec="", name=name, dtype=sync_on_read_variable.dtype, - device=sync_on_read_variable._primary.device) # pylint: disable=protected-access - + device=sync_on_read_variable.primary.device) super(_SyncOnReadSaveable, self).__init__(tensor, [spec], name) def restore(self, restored_tensors, restored_shapes): @@ -1104,7 +1103,7 @@ class SyncOnReadVariable(DistributedVariable): def _get_cross_replica(self): if self._aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA: - return self._primary + return self.primary with _enter_or_assert_strategy(self._distribute_strategy): return self._distribute_strategy.reduce( diff --git a/tensorflow/python/saved_model/save.py b/tensorflow/python/saved_model/save.py index ced4135526a..617f5e83a01 100644 --- a/tensorflow/python/saved_model/save.py +++ b/tensorflow/python/saved_model/save.py @@ -274,7 +274,7 @@ class _SaveableView(object): self.captured_tensor_node_ids[obj.resource_handle] = node_id elif (ds_values.is_distributed_variable(obj) or resource_variable_ops.is_resource_variable(obj)): - obj_to_copy = obj._primary if ds_values.is_distributed_variable( # pylint: disable=protected-access + obj_to_copy = obj.primary if ds_values.is_distributed_variable( obj) else obj new_variable = resource_variable_ops.copy_to_graph_uninitialized( obj_to_copy) From 31a9e7ac5bb176d0a84eaaf2eb9d1e27c98ce9ee Mon Sep 17 00:00:00 2001 From: Lakshay Tokas Date: Tue, 18 Feb 2020 13:19:48 -0800 Subject: [PATCH 161/442] Used the 
refactored method and fixed the typo in the comment --- tensorflow/core/kernels/mkl_softmax_op.cc | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/tensorflow/core/kernels/mkl_softmax_op.cc b/tensorflow/core/kernels/mkl_softmax_op.cc index b9f8e590d0e..768a63ba9c0 100644 --- a/tensorflow/core/kernels/mkl_softmax_op.cc +++ b/tensorflow/core/kernels/mkl_softmax_op.cc @@ -64,12 +64,8 @@ class MklSoftmaxPrimitive : public MklPrimitive { context_.dst_mem->set_data_handle(static_cast(dst_data)); #ifdef ENABLE_MKLDNN_V1 - DCHECK_EQ(context_.fwd_primitives.size(), - context_.fwd_net_args.size()); - for (size_t i = 0; i < context_.fwd_primitives.size(); ++i) { - context_.fwd_primitives.at(i).execute(*context_.fwd_stream, - context_.fwd_net_args.at(i)); - } + execute_primitives(context_.fwd_primitives, context_.fwd_stream, + context_.net_args); #else context_.fwd_stream->submit(context_.fwd_primitives); #endif @@ -120,7 +116,7 @@ class MklSoftmaxPrimitive : public MklPrimitive { context_.src_md.reset( new memory::desc({fwdParams.src_dims}, MklDnnType(), src_format)); - // Create softmax decriptor and primitive descriptor. + // Create softmax descriptor and primitive descriptor. context_.fwd_desc.reset(new mkldnn::softmax_forward::desc( prop_kind::forward_scoring, *context_.src_md, fwdParams.axis)); context_.fwd_pd.reset(new mkldnn::softmax_forward::primitive_desc( @@ -136,8 +132,8 @@ class MklSoftmaxPrimitive : public MklPrimitive { // Create softmax primitive and add it to net context_.softmax_fwd.reset(new mkldnn::softmax_forward(*context_.fwd_pd)); context_.fwd_net_args.push_back({{MKLDNN_ARG_SRC, *context_.src_mem}, - { MKLDNN_ARG_DST, - *context_.dst_mem }}); + { MKLDNN_ARG_DST, + *context_.dst_mem }}); #else context_.softmax_fwd.reset(new mkldnn::softmax_forward( *context_.fwd_pd, *context_.src_mem, *context_.dst_mem)); @@ -311,9 +307,9 @@ class MklSoftmaxOp : public OpKernel { // Execute softmax primitive. softmax_fwd->Execute(src_data, dst_data); } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + ", message: " + - string(e.message) + ", in file " + string(__FILE__) + - ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); OP_REQUIRES_OK( context, errors::Aborted("Operation received an exception:", error_msg)); From 5f3a3019baf611d3720e70c902fd8170dfe3c0b4 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 18 Feb 2020 13:05:19 -0800 Subject: [PATCH 162/442] Replace NodeDef with std::shared_ptr in the kernel creation code paths and try to avoid as many copies of NodeDefs as possible. This will in most cases allow sharing the NodeDef between the OpKernel and the graph Node from which it is created. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reduces the number of allocations in the executor benchmark by about 8%: name old time/op new time/op delta BM_executor/16/1k [Nodes = 9824 ] 911µs ± 3% 911µs ± 1% ~ (p=0.548 n=5+5) BM_executor/32/8k [Nodes = 141991] 17.1ms ± 2% 16.8ms ± 1% -2.17% (p=0.016 n=5+5) BM_executor/1k/16 [Nodes = 6781 ] 1.21ms ± 1% 1.25ms ± 7% ~ (p=0.095 n=5+5) BM_executor/8k/32 [Nodes = 130875] 4.35s ± 0% 4.34s ± 0% ~ (p=0.841 n=5+5) BM_executor/1k/1k [Nodes = 526256] 3.33s ± 1% 3.31s ± 1% ~ (p=0.095 n=5+5) BM_FeedInputFetchOutput 54.0µs ± 7% 56.9µs ±13% ~ (p=0.222 n=5+5) name old allocs/op new allocs/op delta BM_executor/16/1k [Nodes = 9824 ] 15.4k ± 0% 14.1k ± 0% -7.95% (p=0.008 n=5+5) BM_executor/32/8k [Nodes = 141991] 226k ± 0% 208k ± 0% -7.86% (p=0.008 n=5+5) BM_executor/1k/16 [Nodes = 6781 ] 10.2k ± 0% 9.3k ± 0% -8.36% (p=0.008 n=5+5) BM_executor/8k/32 [Nodes = 130875] 197k ± 0% 180k ± 0% -8.31% (p=0.016 n=4+5) BM_executor/1k/1k [Nodes = 526256] 771k ± 0% 706k ± 0% -8.53% (p=0.008 n=5+5) BM_FeedInputFetchOutput 58.0 ± 0% 57.0 ± 0% -1.72% (p=0.008 n=5+5) PiperOrigin-RevId: 295803318 Change-Id: I0d262c6082822023f449f9817dc943d20bd302d5 --- tensorflow/compiler/jit/xla_kernel_creator.cc | 16 +- tensorflow/compiler/jit/xla_kernel_creator.h | 8 +- .../compiler/jit/xla_kernel_creator_test.cc | 42 ++--- .../compiler/jit/xla_kernel_creator_util.cc | 13 +- .../tf2tensorrt/kernels/trt_engine_op_test.cc | 11 +- tensorflow/compiler/tf2xla/graph_compiler.cc | 2 +- tensorflow/core/BUILD | 2 + .../core/common_runtime/direct_session.cc | 37 ++-- .../common_runtime/eager/kernel_and_device.cc | 5 +- tensorflow/core/common_runtime/executor.cc | 8 +- tensorflow/core/common_runtime/executor.h | 10 +- .../core/common_runtime/executor_test.cc | 11 +- tensorflow/core/common_runtime/function.cc | 63 ++++--- .../core/common_runtime/function_test.cc | 11 +- .../core/common_runtime/graph_runner.cc | 7 +- .../kernel_benchmark_testlib.cc | 7 +- .../core/distributed_runtime/graph_mgr.cc | 36 ++-- tensorflow/core/framework/BUILD | 20 +++ tensorflow/core/framework/function.h | 17 +- tensorflow/core/framework/node_properties.cc | 39 +++++ tensorflow/core/framework/node_properties.h | 63 +++++++ .../core/framework/node_properties_test.cc | 128 ++++++++++++++ tensorflow/core/framework/op_kernel.cc | 158 ++++++++++-------- tensorflow/core/framework/op_kernel.h | 92 +++++----- tensorflow/core/framework/op_kernel_test.cc | 1 + tensorflow/core/graph/graph.cc | 21 +-- tensorflow/core/graph/graph.h | 5 +- tensorflow/core/kernels/constant_op.cc | 19 +-- .../core/kernels/data/dataset_test_base.cc | 14 +- .../kernels/data/single_threaded_executor.cc | 3 +- .../data/single_threaded_executor_test.cc | 11 +- tensorflow/python/eager/pywrap_tfe_test.py | 3 +- 32 files changed, 597 insertions(+), 286 deletions(-) create mode 100644 tensorflow/core/framework/node_properties.cc create mode 100644 tensorflow/core/framework/node_properties.h create mode 100644 tensorflow/core/framework/node_properties_test.cc diff --git a/tensorflow/compiler/jit/xla_kernel_creator.cc b/tensorflow/compiler/jit/xla_kernel_creator.cc index 6ee1db2c7c5..fd6fd4b5b58 100644 --- a/tensorflow/compiler/jit/xla_kernel_creator.cc +++ b/tensorflow/compiler/jit/xla_kernel_creator.cc @@ -20,15 +20,17 @@ limitations under the License. 
namespace tensorflow { -bool XlaKernelCreator::CanCreateKernel(const FunctionLibraryRuntime& flr, - const NodeDef& node_def) const { - return CanCreateXlaKernel(node_def); +bool XlaKernelCreator::CanCreateKernel( + const FunctionLibraryRuntime& flr, + const std::shared_ptr& props) const { + return CanCreateXlaKernel(props->node_def); } -Status XlaKernelCreator::CreateKernel(FunctionLibraryRuntime* flr, - const NodeDef& node_def, - std::unique_ptr* kernel) const { - return CreateXlaKernel(flr, node_def, kernel); +Status XlaKernelCreator::CreateKernel( + FunctionLibraryRuntime* flr, + const std::shared_ptr& props, + std::unique_ptr* kernel) const { + return CreateXlaKernel(flr, props->node_def, kernel); } namespace { diff --git a/tensorflow/compiler/jit/xla_kernel_creator.h b/tensorflow/compiler/jit/xla_kernel_creator.h index 8815ee49ce5..856701a791d 100644 --- a/tensorflow/compiler/jit/xla_kernel_creator.h +++ b/tensorflow/compiler/jit/xla_kernel_creator.h @@ -29,11 +29,13 @@ class XlaKernelCreator : public CustomKernelCreator { // Given a NodeDef 'node_def' and the function library runtime 'flr', returns // true if 'node_def' is a call to a compilable function defined in 'flr', // with the kXlaCompileAttr set. - bool CanCreateKernel(const FunctionLibraryRuntime& flr, - const NodeDef& node_def) const override; + bool CanCreateKernel( + const FunctionLibraryRuntime& flr, + const std::shared_ptr& props) const override; // Given a supported NodeDef, returns a XlaLaunchOp that computes the node. - Status CreateKernel(FunctionLibraryRuntime* flr, const NodeDef& node_def, + Status CreateKernel(FunctionLibraryRuntime* flr, + const std::shared_ptr& props, std::unique_ptr* kernel) const override; }; diff --git a/tensorflow/compiler/jit/xla_kernel_creator_test.cc b/tensorflow/compiler/jit/xla_kernel_creator_test.cc index 7ec37332906..ad94d60d9b5 100644 --- a/tensorflow/compiler/jit/xla_kernel_creator_test.cc +++ b/tensorflow/compiler/jit/xla_kernel_creator_test.cc @@ -30,10 +30,12 @@ limitations under the License. namespace tensorflow { -NodeDef ToNodeDef(const string& text) { +std::shared_ptr ToNodeProperties(const string& text) { NodeDef node_def; + DataTypeVector dummy; EXPECT_TRUE(protobuf::TextFormat::MergeFromString(text, &node_def)); - return node_def; + return std::make_shared(nullptr, std::move(node_def), dummy, + dummy); } // Create a FunctionDef that takes one resource and one regular param @@ -98,11 +100,11 @@ TEST_F(XlaKernelCreatorTest, OneFloatOneResourceArgument) { (*fdef.mutable_attr())["_XlaMustCompile"] = BoolAttr(true); Init({fdef}); XlaKernelCreator xla_kernel_creator; - NodeDef callsite = - ToNodeDef(R"pb( + auto callsite = + ToNodeProperties(R"pb( name: 'XTimesY' op: 'XTimesY' input: 'a' input: 'b' )pb"); - (*callsite.mutable_attr())["_XlaMustCompile"] = BoolAttr(true); + (*(callsite->node_def.mutable_attr()))["_XlaMustCompile"] = BoolAttr(true); // Note: need to set attribute on the created node. 
Status status = xla_kernel_creator.CreateKernel(flr_, callsite, &kernel_); @@ -127,13 +129,14 @@ TEST_F(XlaKernelCreatorTest, FailsIfXlaCompileAttrNotSet) { Init({fdef}); XlaKernelCreator xla_kernel_creator; - Status status = xla_kernel_creator.CreateKernel(flr_, ToNodeDef(R"proto( - name: 'XTimesY' - op: 'XTimesY' - input: 'a' - input: 'b' - )proto"), - &kernel_); + Status status = + xla_kernel_creator.CreateKernel(flr_, ToNodeProperties(R"proto( + name: 'XTimesY' + op: 'XTimesY' + input: 'a' + input: 'b' + )proto"), + &kernel_); EXPECT_TRUE(errors::IsInternal(status)) << status.ToString(); } @@ -143,13 +146,14 @@ TEST_F(XlaKernelCreatorTest, FailsIfXlaCompileAttrIsSetToFalse) { Init({fdef}); XlaKernelCreator xla_kernel_creator; - Status status = xla_kernel_creator.CreateKernel(flr_, ToNodeDef(R"proto( - name: 'XTimesY' - op: 'XTimesY' - input: 'a' - input: 'b' - )proto"), - &kernel_); + Status status = + xla_kernel_creator.CreateKernel(flr_, ToNodeProperties(R"proto( + name: 'XTimesY' + op: 'XTimesY' + input: 'a' + input: 'b' + )proto"), + &kernel_); EXPECT_TRUE(errors::IsInternal(status)) << status.ToString(); } diff --git a/tensorflow/compiler/jit/xla_kernel_creator_util.cc b/tensorflow/compiler/jit/xla_kernel_creator_util.cc index 5aab0ff3bd6..de091fc93b4 100644 --- a/tensorflow/compiler/jit/xla_kernel_creator_util.cc +++ b/tensorflow/compiler/jit/xla_kernel_creator_util.cc @@ -218,12 +218,13 @@ Status CreateXlaKernel(FunctionLibraryRuntime* flr, const NodeDef& node_def, TF_RETURN_IF_ERROR(NameAndAttrsFromFunctionCall(node_def, &function)); Device* dev = flr->device(); Status s; - OpKernelConstruction construction( - DeviceType(dev->device_type()), dev, - dev->GetAllocator(AllocatorAttributes()), &node_def, - &fbody->fdef.signature(), flr, dev->resource_manager(), fbody->arg_types, - input_memory_types, fbody->ret_types, output_memory_types, - flr->graph_def_version(), &s); + auto props = std::make_shared( + &fbody->fdef.signature(), node_def, fbody->arg_types, fbody->ret_types); + OpKernelConstruction construction(DeviceType(dev->device_type()), dev, + dev->GetAllocator(AllocatorAttributes()), + flr, dev->resource_manager(), props, + input_memory_types, output_memory_types, + flr->graph_def_version(), &s); *kernel = absl::make_unique( &construction, constant_arg_indices, resource_arg_indices, function, diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc index a88f2b5e29e..bc42de6832d 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc @@ -127,9 +127,14 @@ class TRTEngineOpTestBase : public OpsTestBase { private: Status InitOpWithFunctionLibrary() { OpKernel* kernel = nullptr; - Status status = CreateOpKernel(device_type_, device_, allocator(), - pflr_->GetFLR(device_->name()), node_def_, - TF_GRAPH_DEF_VERSION, &kernel); + auto flr = pflr_->GetFLR(device_->name()); + std::shared_ptr props; + Status status = NodeProperties::CreateFromNodeDef( + node_def_, flr->GetFunctionLibraryDefinition(), &props); + if (status.ok()) { + status.Update(CreateOpKernel(device_type_, device_, allocator(), flr, + props, TF_GRAPH_DEF_VERSION, &kernel)); + } kernel_ = std::unique_ptr(kernel); if (kernel_ != nullptr) input_types_ = kernel_->input_types(); return status; diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc index 34888fc0e2f..f0aebc9b543 100644 --- 
a/tensorflow/compiler/tf2xla/graph_compiler.cc +++ b/tensorflow/compiler/tf2xla/graph_compiler.cc @@ -133,7 +133,7 @@ Status GraphCompiler::Compile() { OpKernel* op_kernel_raw = nullptr; // The kernel is not actually run for functional ops, we just need it // for metadata. - Status s = flib_->CreateKernel(n->def(), &op_kernel_raw); + Status s = flib_->CreateKernel(n->properties(), &op_kernel_raw); // Transfer ownership of the kernel to a local smart pointer. std::unique_ptr op_kernel(op_kernel_raw); diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index b89068c7a83..4f0df417037 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -472,6 +472,7 @@ tf_cuda_library( "//tensorflow/core/framework:memory_types.h", "//tensorflow/core/framework:node_def_builder.h", "//tensorflow/core/framework:node_def_util.h", + "//tensorflow/core/framework:node_properties.h", "//tensorflow/core/framework:numeric_op.h", "//tensorflow/core/framework:numeric_types.h", "//tensorflow/core/framework:op.h", @@ -2323,6 +2324,7 @@ tf_cuda_library( "//tensorflow/core/framework:bfloat16", "//tensorflow/core/framework:common_shape_fns", "//tensorflow/core/framework:node_def_util", + "//tensorflow/core/framework:node_properties", "//tensorflow/core/framework:numeric_types", "//tensorflow/core/framework:op", "//tensorflow/core/framework:op_def_builder", diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc index 098217a607a..a196f74c65b 100644 --- a/tensorflow/core/common_runtime/direct_session.cc +++ b/tensorflow/core/common_runtime/direct_session.cc @@ -1356,24 +1356,25 @@ Status DirectSession::CreateExecutors( params.session_metadata = session_metadata; params.function_library = lib; auto opseg = device->op_segment(); - params.create_kernel = [this, lib, opseg](const NodeDef& ndef, - OpKernel** kernel) { - // NOTE(mrry): We must not share function kernels (implemented - // using `CallOp`) between subgraphs, because `CallOp::handle_` - // is tied to a particular subgraph. Even if the function itself - // is stateful, the `CallOp` that invokes it is not. - if (!OpSegment::ShouldOwnKernel(lib, ndef.op())) { - return lib->CreateKernel(ndef, kernel); - } - auto create_fn = [lib, &ndef](OpKernel** kernel) { - return lib->CreateKernel(ndef, kernel); - }; - // Kernels created for subgraph nodes need to be cached. On - // cache miss, create_fn() is invoked to create a kernel based - // on the function library here + global op registry. - return opseg->FindOrCreate(session_handle_, ndef.name(), kernel, - create_fn); - }; + params.create_kernel = + [this, lib, opseg](const std::shared_ptr& props, + OpKernel** kernel) { + // NOTE(mrry): We must not share function kernels (implemented + // using `CallOp`) between subgraphs, because `CallOp::handle_` + // is tied to a particular subgraph. Even if the function itself + // is stateful, the `CallOp` that invokes it is not. + if (!OpSegment::ShouldOwnKernel(lib, props->node_def.op())) { + return lib->CreateKernel(props, kernel); + } + auto create_fn = [lib, &props](OpKernel** kernel) { + return lib->CreateKernel(props, kernel); + }; + // Kernels created for subgraph nodes need to be cached. On + // cache miss, create_fn() is invoked to create a kernel based + // on the function library here + global op registry. 
+ return opseg->FindOrCreate(session_handle_, props->node_def.name(), + kernel, create_fn); + }; params.delete_kernel = [lib](OpKernel* kernel) { if (kernel && !OpSegment::ShouldOwnKernel(lib, kernel->type_string())) delete kernel; diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc index 6e8a5b9689a..8ca02ca51c0 100644 --- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc +++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc @@ -98,7 +98,10 @@ Status KernelAndDeviceOp::Init(const NodeDef& ndef, "A valid FunctionLibraryRuntime must be provided when running ops " "based on OpKernel."); } - TF_RETURN_IF_ERROR(flr_->CreateKernel(ndef, &k)); + std::shared_ptr props; + TF_RETURN_IF_ERROR(NodeProperties::CreateFromNodeDef( + ndef, flr_->GetFunctionLibraryDefinition(), &props)); + TF_RETURN_IF_ERROR(flr_->CreateKernel(props, &k)); kernel_.reset(k); input_alloc_attrs_.resize(kernel_->num_inputs()); diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index bd3e14129b3..3a43a193b9e 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -654,7 +654,7 @@ Status ExecutorImpl::Initialize(const Graph& graph) { item->input_start = frame_info->total_inputs; frame_info->total_inputs += n->num_inputs(); - Status s = params_.create_kernel(n->def(), &item->kernel); + Status s = params_.create_kernel(n->properties(), &item->kernel); if (!s.ok()) { item->kernel = nullptr; s = AttachDef(s, *n); @@ -2974,12 +2974,12 @@ Status NewLocalExecutor(const LocalExecutorParams& params, const Graph& graph, } Status CreateNonCachedKernel(Device* device, FunctionLibraryRuntime* flib, - const NodeDef& ndef, int graph_def_version, - OpKernel** kernel) { + const std::shared_ptr& props, + int graph_def_version, OpKernel** kernel) { const auto device_type = DeviceType(device->attributes().device_type()); auto allocator = device->GetAllocator(AllocatorAttributes()); return CreateOpKernel(device_type, device, allocator, flib, - device->resource_manager(), ndef, graph_def_version, + device->resource_manager(), props, graph_def_version, kernel); } diff --git a/tensorflow/core/common_runtime/executor.h b/tensorflow/core/common_runtime/executor.h index a7cb01ec7f0..fcc64b9d986 100644 --- a/tensorflow/core/common_runtime/executor.h +++ b/tensorflow/core/common_runtime/executor.h @@ -145,7 +145,9 @@ struct LocalExecutorParams { // create_kernel returns an instance of op kernel based on NodeDef. // delete_kernel is called for every kernel used by the executor // when the executor is deleted. - std::function create_kernel; + std::function&, + OpKernel**)> + create_kernel; std::function delete_kernel; Executor::RendezvousFactory rendezvous_factory; @@ -240,12 +242,12 @@ class ExecutorBarrier { // A few helpers to facilitate create/delete kernels. -// Creates a kernel based on "ndef" on device "device". The kernel can +// Creates a kernel based on "props" on device "device". The kernel can // access the functions in the "flib". The caller takes ownership of // returned "*kernel". Status CreateNonCachedKernel(Device* device, FunctionLibraryRuntime* flib, - const NodeDef& ndef, int graph_def_version, - OpKernel** kernel); + const std::shared_ptr& props, + int graph_def_version, OpKernel** kernel); // Deletes "kernel" returned by CreateKernel. 
void DeleteNonCachedKernel(OpKernel* kernel); diff --git a/tensorflow/core/common_runtime/executor_test.cc b/tensorflow/core/common_runtime/executor_test.cc index e994512a43f..3f143c75714 100644 --- a/tensorflow/core/common_runtime/executor_test.cc +++ b/tensorflow/core/common_runtime/executor_test.cc @@ -61,11 +61,12 @@ class ExecutorTest : public ::testing::Test { const int version = graph->versions().producer(); LocalExecutorParams params; params.device = device_.get(); - params.create_kernel = [this, version](const NodeDef& ndef, - OpKernel** kernel) { - return CreateNonCachedKernel(device_.get(), nullptr, ndef, version, - kernel); - }; + params.create_kernel = + [this, version](const std::shared_ptr& props, + OpKernel** kernel) { + return CreateNonCachedKernel(device_.get(), nullptr, props, version, + kernel); + }; params.delete_kernel = [](OpKernel* kernel) { DeleteNonCachedKernel(kernel); }; diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc index 14c0a8f5ad2..2140bf7f72b 100644 --- a/tensorflow/core/common_runtime/function.cc +++ b/tensorflow/core/common_runtime/function.cc @@ -187,7 +187,8 @@ class FunctionLibraryRuntimeOverlay : public FunctionLibraryRuntime { void Run(const Options& opts, Handle handle, CallFrameInterface* call_frame, DoneCallback done) override; - Status CreateKernel(const NodeDef& ndef, OpKernel** kernel) override; + Status CreateKernel(const std::shared_ptr& props, + OpKernel** kernel) override; bool IsStateful(const string& function_name) const override; @@ -256,7 +257,8 @@ void FunctionLibraryRuntimeOverlay::Run(const Options& opts, Handle handle, base_flr_->Run(opts, handle, call_frame, std::move(done)); } -Status FunctionLibraryRuntimeOverlay::CreateKernel(const NodeDef&, OpKernel**) { +Status FunctionLibraryRuntimeOverlay::CreateKernel( + const std::shared_ptr&, OpKernel**) { // We don't have access to base_lib_def_ in base function library runtime (aka // FunctionLibraryRuntimeImpl), so to make sure we do not create a kernel with // the wrong lib_def we just disable creation of new kernels through overlays. @@ -344,7 +346,8 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime { Status GetRetTypes(Handle handle, DataTypeVector* ret_types) override; - Status CreateKernel(const NodeDef& ndef, OpKernel** kernel) override; + Status CreateKernel(const std::shared_ptr& props, + OpKernel** kernel) override; void Run(const Options& opts, Handle handle, gtl::ArraySlice args, std::vector* rets, DoneCallback done) override; @@ -393,7 +396,9 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime { const string device_name_; std::function get_func_sig_; - std::function create_kernel_; + std::function&, + OpKernel**)> + create_kernel_; mutable mutex mu_; @@ -426,8 +431,8 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime { // to use for kernel creation and execution. In particular, this method can // accept a FunctionLibraryRuntimeOverlay that overlays a different // FunctionLibraryDefinition. 
- Status CreateKernel(const NodeDef& ndef, FunctionLibraryRuntime* flr, - OpKernel** kernel); + Status CreateKernel(const std::shared_ptr& props, + FunctionLibraryRuntime* flr, OpKernel** kernel); Status FunctionDefToBody(const FunctionDef& fdef, AttrSlice attrs, const FunctionLibraryDefinition* lib_def, std::unique_ptr* fbody); @@ -476,8 +481,9 @@ FunctionLibraryRuntimeImpl::FunctionLibraryRuntimeImpl( get_func_sig_ = [this](const string& op, const OpDef** sig) { return base_lib_def_->LookUpOpDef(op, sig); }; - create_kernel_ = [this](const NodeDef& ndef, OpKernel** kernel) { - return CreateKernel(ndef, kernel); + create_kernel_ = [this](const std::shared_ptr& props, + OpKernel** kernel) { + return CreateKernel(props, kernel); }; thread::ThreadPool* pool = nullptr; if (device_ != nullptr) { @@ -589,20 +595,20 @@ Status FunctionLibraryRuntimeImpl::GetRetTypes(Handle h, return Status::OK(); } -Status FunctionLibraryRuntimeImpl::CreateKernel(const NodeDef& ndef, - OpKernel** kernel) { - return CreateKernel(ndef, this, kernel); +Status FunctionLibraryRuntimeImpl::CreateKernel( + const std::shared_ptr& props, OpKernel** kernel) { + return CreateKernel(props, this, kernel); } -Status FunctionLibraryRuntimeImpl::CreateKernel(const NodeDef& ndef, - FunctionLibraryRuntime* flr, - OpKernel** kernel) { +Status FunctionLibraryRuntimeImpl::CreateKernel( + const std::shared_ptr& props, + FunctionLibraryRuntime* flr, OpKernel** kernel) { // If a custom kernel creator is given, try that. Status s; if (custom_kernel_creator_ != nullptr && - custom_kernel_creator_->CanCreateKernel(*this, ndef)) { + custom_kernel_creator_->CanCreateKernel(*this, props)) { std::unique_ptr ret; - s = custom_kernel_creator_->CreateKernel(this, ndef, &ret); + s = custom_kernel_creator_->CreateKernel(this, props, &ret); if (s.ok()) { *kernel = ret.release(); } else { @@ -613,9 +619,9 @@ Status FunctionLibraryRuntimeImpl::CreateKernel(const NodeDef& ndef, const FunctionLibraryDefinition* lib_def = flr->GetFunctionLibraryDefinition(); - if (lib_def->Find(ndef.op()) == nullptr) { + if (lib_def->Find(props->node_def.op()) == nullptr) { // A primitive operation. Creates the registered kernel. - return CreateNonCachedKernel(device_, flr, ndef, graph_def_version_, + return CreateNonCachedKernel(device_, flr, props, graph_def_version_, kernel); } @@ -626,8 +632,9 @@ Status FunctionLibraryRuntimeImpl::CreateKernel(const NodeDef& ndef, options.lib_def = lib_def; } Handle handle; - TF_RETURN_IF_ERROR( - Instantiate(ndef.op(), AttrSlice(&ndef.attr()), options, &handle)); + TF_RETURN_IF_ERROR(Instantiate(props->node_def.op(), + AttrSlice(&props->node_def.attr()), options, + &handle)); const FunctionBody* fbody = GetFunctionBody(handle); CHECK_NOTNULL(fbody); @@ -647,10 +654,12 @@ Status FunctionLibraryRuntimeImpl::CreateKernel(const NodeDef& ndef, // Constructs a CallOp kernel for running the instantiated function. 
auto device_type = DeviceType(device_->attributes().device_type()); + auto new_props = std::make_shared( + &fbody->fdef.signature(), props->node_def, fbody->arg_types, + fbody->ret_types); OpKernelConstruction construction( - device_type, device_, device_->GetAllocator(AllocatorAttributes()), &ndef, - &fbody->fdef.signature(), flr, device_->resource_manager(), - fbody->arg_types, input_memory_types, fbody->ret_types, + device_type, device_, device_->GetAllocator(AllocatorAttributes()), flr, + device_->resource_manager(), props, input_memory_types, output_memory_types, graph_def_version_, &s); if (s.ok()) { *kernel = new CallOp(handle, &construction); @@ -953,9 +962,11 @@ Status FunctionLibraryRuntimeImpl::CreateItem(Item** item) { if (flr == this) { params.create_kernel = create_kernel_; } else { - params.create_kernel = [this, flr](const NodeDef& ndef, OpKernel** kernel) { - return CreateKernel(ndef, flr, kernel); - }; + params.create_kernel = + [this, flr](const std::shared_ptr& props, + OpKernel** kernel) { + return CreateKernel(props, flr, kernel); + }; } params.delete_kernel = [](OpKernel* kernel) { DeleteNonCachedKernel(kernel); diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc index c1247190d2d..3e2371a686a 100644 --- a/tensorflow/core/common_runtime/function_test.cc +++ b/tensorflow/core/common_runtime/function_test.cc @@ -90,11 +90,12 @@ class FunctionTest : public ::testing::Test { const int version = g->versions().producer(); LocalExecutorParams params; params.device = device_.get(); - params.create_kernel = [this, version](const NodeDef& ndef, - OpKernel** kernel) { - return CreateNonCachedKernel(device_.get(), nullptr, ndef, version, - kernel); - }; + params.create_kernel = + [this, version](const std::shared_ptr& props, + OpKernel** kernel) { + return CreateNonCachedKernel(device_.get(), nullptr, props, version, + kernel); + }; params.delete_kernel = [](OpKernel* kernel) { DeleteNonCachedKernel(kernel); }; diff --git a/tensorflow/core/common_runtime/graph_runner.cc b/tensorflow/core/common_runtime/graph_runner.cc index 0a7d50f9ea4..7ffb860a2ce 100644 --- a/tensorflow/core/common_runtime/graph_runner.cc +++ b/tensorflow/core/common_runtime/graph_runner.cc @@ -157,9 +157,10 @@ Status GraphRunner::Run(Graph* graph, FunctionLibraryRuntime* function_library, params.device = device_; params.function_library = function_library; const int producer = graph_to_run->versions().producer(); - params.create_kernel = [this, function_library, producer](const NodeDef& ndef, - OpKernel** kernel) { - return CreateNonCachedKernel(device_, function_library, ndef, producer, + params.create_kernel = [this, function_library, producer]( + const std::shared_ptr& props, + OpKernel** kernel) { + return CreateNonCachedKernel(device_, function_library, props, producer, kernel); }; params.delete_kernel = [](OpKernel* kernel) { delete kernel; }; diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc index fe703050602..4118534cb3e 100644 --- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc +++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc @@ -84,9 +84,10 @@ Benchmark::Benchmark(const string& device, Graph* g, LocalExecutorParams params; params.device = device_.get(); params.function_library = nullptr; - params.create_kernel = [this, graph_def_version](const NodeDef& ndef, - OpKernel** kernel) { - return CreateNonCachedKernel(device_.get(), 
nullptr, ndef, + params.create_kernel = [this, graph_def_version]( + const std::shared_ptr& props, + OpKernel** kernel) { + return CreateNonCachedKernel(device_.get(), nullptr, props, graph_def_version, kernel); }; params.delete_kernel = [](OpKernel* kernel) { diff --git a/tensorflow/core/distributed_runtime/graph_mgr.cc b/tensorflow/core/distributed_runtime/graph_mgr.cc index 9b28651c597..96fc4f3d4f3 100644 --- a/tensorflow/core/distributed_runtime/graph_mgr.cc +++ b/tensorflow/core/distributed_runtime/graph_mgr.cc @@ -233,23 +233,25 @@ Status GraphMgr::InitItem( // Construct the root executor for the subgraph. params.device = unit->device; params.function_library = lib; - params.create_kernel = [handle, lib, opseg](const NodeDef& ndef, - OpKernel** kernel) { - // NOTE(mrry): We must not share function kernels (implemented - // using `CallOp`) between subgraphs, because `CallOp::handle_` - // is tied to a particular subgraph. Even if the function itself - // is stateful, the `CallOp` that invokes it is not. - if (!OpSegment::ShouldOwnKernel(lib, ndef.op())) { - return lib->CreateKernel(ndef, kernel); - } - auto create_fn = [lib, &ndef](OpKernel** kernel) { - return lib->CreateKernel(ndef, kernel); - }; - // Kernels created for subgraph nodes need to be cached. On - // cache miss, create_fn() is invoked to create a kernel based - // on the function library here + global op registry. - return opseg->FindOrCreate(handle, ndef.name(), kernel, create_fn); - }; + params.create_kernel = + [handle, lib, opseg](const std::shared_ptr& props, + OpKernel** kernel) { + // NOTE(mrry): We must not share function kernels (implemented + // using `CallOp`) between subgraphs, because `CallOp::handle_` + // is tied to a particular subgraph. Even if the function itself + // is stateful, the `CallOp` that invokes it is not. + if (!OpSegment::ShouldOwnKernel(lib, props->node_def.op())) { + return lib->CreateKernel(props, kernel); + } + auto create_fn = [lib, &props](OpKernel** kernel) { + return lib->CreateKernel(props, kernel); + }; + // Kernels created for subgraph nodes need to be cached. On + // cache miss, create_fn() is invoked to create a kernel based + // on the function library here + global op registry. 
+ return opseg->FindOrCreate(handle, props->node_def.name(), kernel, + create_fn); + }; params.delete_kernel = [lib](OpKernel* kernel) { if (kernel && !OpSegment::ShouldOwnKernel(lib, kernel->type_string())) { delete kernel; diff --git a/tensorflow/core/framework/BUILD b/tensorflow/core/framework/BUILD index 003e4894788..f3207dd657a 100644 --- a/tensorflow/core/framework/BUILD +++ b/tensorflow/core/framework/BUILD @@ -129,6 +129,7 @@ exports_files( "attr_value_util.h", "common_shape_fns.h", "node_def_util.h", + "node_properties.h", "op.h", "op_def_builder.h", "op_def_util.h", @@ -172,6 +173,7 @@ filegroup( "model.h", "node_def_builder.h", "node_def_util.h", + "node_properties.h", "numeric_op.h", "numeric_types.h", "op.h", @@ -338,6 +340,8 @@ filegroup( "node_def_builder.h", "node_def_util.cc", "node_def_util.h", + "node_properties.cc", + "node_properties.h", "numeric_op.h", "op.cc", "op.h", @@ -862,6 +866,21 @@ cc_library( ], ) +cc_library( + name = "node_properties", + srcs = ["node_properties.cc"], + hdrs = ["node_properties.h"], + deps = [ + ":node_def_proto_cc", + ":node_def_util", + ":op", + ":op_def_proto_cc", + ":tensor", + ":types_proto_cc", + "//tensorflow/core/lib/core:status", + ], +) + cc_library( name = "op_def_builder", srcs = ["op_def_builder.cc"], @@ -967,6 +986,7 @@ tf_cc_tests( "model_test.cc", "node_def_builder_test.cc", "node_def_util_test.cc", + "node_properties_test.cc", "op_compatibility_test.cc", "op_def_builder_test.cc", "op_def_util_test.cc", diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h index 0e260d26592..58cc1bbdaf9 100644 --- a/tensorflow/core/framework/function.h +++ b/tensorflow/core/framework/function.h @@ -722,11 +722,13 @@ class FunctionLibraryRuntime { virtual void Run(const Options& opts, Handle handle, CallFrameInterface* call_frame, DoneCallback done) = 0; - // Creates a "kernel" for the given node def "ndef". + // Creates a "kernel" for the given NodeProperties "props". // // If succeeds, returns OK and the caller takes the ownership of the // returned "*kernel". Otherwise, returns an error. - virtual Status CreateKernel(const NodeDef& ndef, OpKernel** kernel) = 0; + virtual Status CreateKernel( + const std::shared_ptr& props, + OpKernel** kernel) = 0; // Returns true iff the function named `function_name` is stateful. // @@ -818,12 +820,15 @@ class CustomKernelCreator { // Given a NodeDef 'node_def' and the function library runtime 'flr', // validate if the class supports creating such a kernel. - virtual bool CanCreateKernel(const FunctionLibraryRuntime& flr, - const NodeDef& node_def) const = 0; + virtual bool CanCreateKernel( + const FunctionLibraryRuntime& flr, + const std::shared_ptr& props) const = 0; // Given a supported NodeDef, returns a kernel that computes the node. - virtual Status CreateKernel(FunctionLibraryRuntime* flr, const NodeDef& ndef, - std::unique_ptr* kernel) const = 0; + virtual Status CreateKernel( + FunctionLibraryRuntime* flr, + const std::shared_ptr& props, + std::unique_ptr* kernel) const = 0; }; // Used to instantiate and run functions in a distributed system. diff --git a/tensorflow/core/framework/node_properties.cc b/tensorflow/core/framework/node_properties.cc new file mode 100644 index 00000000000..bcc81bdbbff --- /dev/null +++ b/tensorflow/core/framework/node_properties.cc @@ -0,0 +1,39 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/node_properties.h" + +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op.h" + +namespace tensorflow { + +// static +Status NodeProperties::CreateFromNodeDef( + NodeDef node_def, const OpRegistryInterface* op_registry, + std::shared_ptr* props) { + const OpDef* op_def; + TF_RETURN_IF_ERROR(op_registry->LookUpOpDef(node_def.op(), &op_def)); + DataTypeVector input_types; + DataTypeVector output_types; + TF_RETURN_IF_ERROR( + InOutTypesForNode(node_def, *op_def, &input_types, &output_types)); + props->reset(new NodeProperties(op_def, std::move(node_def), + std::move(input_types), + std::move(output_types))); + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/core/framework/node_properties.h b/tensorflow/core/framework/node_properties.h new file mode 100644 index 00000000000..0382321f486 --- /dev/null +++ b/tensorflow/core/framework/node_properties.h @@ -0,0 +1,63 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_NODE_PROPERTIES_H_ +#define TENSORFLOW_CORE_FRAMEWORK_NODE_PROPERTIES_H_ + +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +class OpRegistryInterface; + +struct NodeProperties { + public: + NodeProperties(const OpDef* op_def, NodeDef node_def, + const DataTypeSlice inputs, const DataTypeSlice outputs) + : NodeProperties(op_def, std::move(node_def), + DataTypeVector(inputs.begin(), inputs.end()), + DataTypeVector(outputs.begin(), outputs.end())) {} + + NodeProperties(const OpDef* _op_def, NodeDef&& _node_def, + DataTypeVector inputs, DataTypeVector outputs) + : op_def(_op_def), + node_def(std::move(_node_def)), + input_types(std::move(inputs)), + input_types_slice(input_types), + output_types(std::move(outputs)), + output_types_slice(output_types) {} + + // Resets the 'props' shared pointer to point to a new NodeProperties created + // from the given NodeDef. 'op_registry' is used to look up the OpDef + // corresponding to node_def.op(). Returns an error if OpDef lookup or + // creation failed. 
+ static Status CreateFromNodeDef(NodeDef node_def, + const OpRegistryInterface* op_registry, + std::shared_ptr* props); + + const OpDef* op_def; // not owned. + NodeDef node_def; + DataTypeVector input_types; + DataTypeSlice input_types_slice; + DataTypeVector output_types; + DataTypeSlice output_types_slice; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_NODE_PROPERTIES_H_ diff --git a/tensorflow/core/framework/node_properties_test.cc b/tensorflow/core/framework/node_properties_test.cc new file mode 100644 index 00000000000..9f76b953b06 --- /dev/null +++ b/tensorflow/core/framework/node_properties_test.cc @@ -0,0 +1,128 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/node_properties.h" + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_def_builder.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +OpDef ToOpDef(const OpDefBuilder& builder) { + OpRegistrationData op_reg_data; + EXPECT_TRUE(builder.Finalize(&op_reg_data).ok()); + return op_reg_data.op_def; +} + +class MockOpRegistry : public OpRegistryInterface { + public: + MockOpRegistry() + : op_reg_(ToOpDef(OpDefBuilder("Foo") + .Input("f: float") + .Input("i: int32") + .Output("of: double"))) {} + ~MockOpRegistry() override {} + + // Returns an error status and sets *op_reg_data to nullptr if no OpDef is + // registered under that name, otherwise returns the registered OpDef. + // Caller must not delete the returned pointer. 
+ Status LookUp(const string& op_type_name, + const OpRegistrationData** op_reg_data) const override { + if (op_type_name == "Foo") { + *op_reg_data = &op_reg_; + return Status::OK(); + } else { + *op_reg_data = nullptr; + return errors::InvalidArgument("Op type named ", op_type_name, + " not found"); + } + } + + const OpDef* get_op_def_addr() { return &op_reg_.op_def; } + + private: + const OpRegistrationData op_reg_; +}; + +void ValidateNodeProperties(const NodeProperties& props, const OpDef* op_def, + const NodeDef& node_def, + const DataTypeVector& input_types, + const DataTypeVector& output_types) { + EXPECT_EQ(props.op_def, op_def); + EXPECT_EQ(props.node_def.name(), node_def.name()); + ASSERT_EQ(props.input_types.size(), input_types.size()); + for (int i = 0; i < input_types.size(); ++i) { + EXPECT_EQ(props.input_types[i], input_types[i]); + EXPECT_EQ(props.input_types_slice[i], input_types[i]); + } + ASSERT_EQ(props.output_types.size(), output_types.size()); + for (int i = 0; i < output_types.size(); ++i) { + EXPECT_EQ(props.output_types[i], output_types[i]); + EXPECT_EQ(props.output_types_slice[i], output_types[i]); + } +} + +} // namespace + +TEST(NodeProperties, Contructors) { + OpDef op_def; + NodeDef node_def; + node_def.set_name("foo"); + DataTypeVector input_types{DT_FLOAT, DT_INT32}; + DataTypeVector output_types{DT_DOUBLE}; + DataTypeSlice input_types_slice(input_types); + DataTypeSlice output_types_slice(output_types); + + // Construct from slices. + NodeProperties props_from_slices(&op_def, node_def, input_types_slice, + output_types_slice); + ValidateNodeProperties(props_from_slices, &op_def, node_def, input_types, + output_types); + + // Construct from vectors. + NodeProperties props_from_vectors(&op_def, node_def, input_types, + output_types); + ValidateNodeProperties(props_from_vectors, &op_def, node_def, input_types, + output_types); +} + +TEST(NodeProperties, CreateFromNodeDef) { + MockOpRegistry op_registry; + NodeDef node_def; + node_def.set_name("bar"); + node_def.set_op("Foo"); + node_def.add_input("f_in"); + node_def.add_input("i_in"); + + std::shared_ptr props; + EXPECT_TRUE( + NodeProperties::CreateFromNodeDef(node_def, &op_registry, &props).ok()); + + DataTypeVector input_types{DT_FLOAT, DT_INT32}; + DataTypeVector output_types{DT_DOUBLE}; + ValidateNodeProperties(*props, op_registry.get_op_def_addr(), node_def, + input_types, output_types); + + // The OpDef lookup should fail for this one: + node_def.set_op("Baz"); + std::shared_ptr props_bad; + EXPECT_FALSE( + NodeProperties::CreateFromNodeDef(node_def, &op_registry, &props_bad) + .ok()); + EXPECT_EQ(props_bad, nullptr); +} +} // namespace tensorflow diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc index 2feb84a1786..38c56eb3b1c 100644 --- a/tensorflow/core/framework/op_kernel.cc +++ b/tensorflow/core/framework/op_kernel.cc @@ -35,9 +35,9 @@ limitations under the License. 
#include "tensorflow/core/framework/memory_types.h" #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/node_properties.h" #include "tensorflow/core/framework/op_def_util.h" #include "tensorflow/core/framework/types.h" -#include "tensorflow/core/graph/graph.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/notification.h" #include "tensorflow/core/lib/core/stringpiece.h" @@ -91,35 +91,53 @@ Status MatchSignatureHelper(const DataTypeSlice expected_inputs, // OpKernel ------------------------------------------------------------------ -OpKernel::OpKernel(OpKernelConstruction* context) - : OpKernel(context, MakeUnique(context->def())) {} +OpKernel::OpKernel(OpKernelConstruction* context) : OpKernel(context, false) {} OpKernel::OpKernel(OpKernelConstruction* context, bool is_deferred) - : OpKernel(context, MakeUnique(context->def()), - is_deferred) {} - -OpKernel::OpKernel(OpKernelConstruction* context, - std::unique_ptr node_def, bool is_deferred) - : def_(std::move(node_def)), - input_types_(context->input_types().begin(), - context->input_types().end()), + : props_(context->props_), input_memory_types_(context->input_memory_types().begin(), context->input_memory_types().end()), - output_types_(context->output_types().begin(), - context->output_types().end()), output_memory_types_(context->output_memory_types().begin(), context->output_memory_types().end()), input_name_map_(context->num_inputs()), output_name_map_(context->num_outputs()), - name_view_(def_->name()), - type_string_view_(def_->op()), + name_view_(props_->node_def.name()), + type_string_view_(props_->node_def.op()), graph_def_version_(context->graph_def_version()), is_deferred_(is_deferred), cost_estimate_(OpKernel::kInitialCostEstimateCycles) { OP_REQUIRES_OK(context, - NameRangesForNode(*def_, *context->op_def_, &input_name_map_, - &output_name_map_)); - OP_REQUIRES_OK(context, CheckOpDeprecation(*context->op_def_, + NameRangesForNode(props_->node_def, *props_->op_def, + &input_name_map_, &output_name_map_)); + OP_REQUIRES_OK(context, CheckOpDeprecation(*props_->op_def, + context->graph_def_version())); + + // Kernels executing on GPU/SYCL tie very few resources on the CPU where the + // scheduler runs: we consider them as inexpensive. 
+ expensive_ = context->device_type() != DeviceType(DEVICE_GPU) && + context->device_type() != DeviceType(DEVICE_SYCL); +} + +OpKernel::OpKernel(OpKernelConstruction* context, NodeDef&& custom_def, + bool is_deferred) + : props_(std::make_shared( + context->props_->op_def, std::move(custom_def), + context->props_->input_types, context->props_->output_types)), + input_memory_types_(context->input_memory_types().begin(), + context->input_memory_types().end()), + output_memory_types_(context->output_memory_types().begin(), + context->output_memory_types().end()), + input_name_map_(context->num_inputs()), + output_name_map_(context->num_outputs()), + name_view_(props_->node_def.name()), + type_string_view_(props_->node_def.op()), + graph_def_version_(context->graph_def_version()), + is_deferred_(is_deferred), + cost_estimate_(OpKernel::kInitialCostEstimateCycles) { + OP_REQUIRES_OK(context, + NameRangesForNode(props_->node_def, *props_->op_def, + &input_name_map_, &output_name_map_)); + OP_REQUIRES_OK(context, CheckOpDeprecation(*props_->op_def, context->graph_def_version())); // Kernels executing on GPU/SYCL tie very few resources on the CPU where the @@ -134,10 +152,6 @@ const uint64 OpKernel::kInitialCostEstimateCycles; const uint64 OpKernel::kOpIsExpensiveThresholdCycles; const uint64 OpKernel::kCostDecay; -const string& OpKernel::name() const { return def_->name(); } -const string& OpKernel::type_string() const { return def_->op(); } -const string& OpKernel::requested_device() const { return def_->device(); } -const string& OpKernel::requested_input(int i) const { return def_->input(i); } Status OpKernel::InputRange(StringPiece input_name, int* start, int* stop) const { @@ -216,22 +230,18 @@ Tensor* PersistentTensor::AccessTensor(OpKernelContext* context) { OpKernelConstruction::OpKernelConstruction( DeviceType device_type, DeviceBase* device, Allocator* allocator, - const NodeDef* node_def, const OpDef* op_def, FunctionLibraryRuntime* flib, - ResourceMgr* resource_mgr, const DataTypeSlice& input_types, + FunctionLibraryRuntime* flib, ResourceMgr* resource_mgr, + const std::shared_ptr& props, const MemoryTypeSlice& input_memory_types, - const DataTypeSlice& output_types, const MemoryTypeSlice& output_memory_types, int graph_def_version, Status* status) : device_type_(std::move(device_type)), device_(device), allocator_(allocator), - def_(node_def), - op_def_(op_def), flib_(flib), resource_mgr_(resource_mgr), - input_types_(input_types), + props_(props), input_memory_types_(input_memory_types), - output_types_(output_types), output_memory_types_(output_memory_types), graph_def_version_(graph_def_version), status_(status) {} @@ -246,8 +256,8 @@ void OpKernelConstruction::SetStatus(const Status& status) { Status OpKernelConstruction::MatchSignature( const DataTypeSlice expected_inputs, const DataTypeSlice expected_outputs) { - return MatchSignatureHelper(expected_inputs, expected_outputs, input_types_, - output_types_); + return MatchSignatureHelper(expected_inputs, expected_outputs, + props_->input_types, props_->output_types); } Status OpKernelConstruction::allocate_temp(DataType type, @@ -263,7 +273,7 @@ Status OpKernelConstruction::allocate_temp(DataType type, } if (LogMemory::IsEnabled()) { LogMemory::RecordTensorAllocation( - def_->name(), LogMemory::OP_KERNEL_CONSTRUCTION_STEP_ID, new_temp); + def().name(), LogMemory::OP_KERNEL_CONSTRUCTION_STEP_ID, new_temp); } *out_temp = new_temp; return Status::OK(); @@ -288,7 +298,7 @@ Status OpKernelConstruction::allocate_temp(DataType type, } 
if (LogMemory::IsEnabled()) { LogMemory::RecordTensorAllocation( - def_->name(), LogMemory::OP_KERNEL_CONSTRUCTION_STEP_ID, new_temp); + def().name(), LogMemory::OP_KERNEL_CONSTRUCTION_STEP_ID, new_temp); } *out_temp = new_temp; return Status::OK(); @@ -1544,45 +1554,65 @@ string KernelsRegisteredForOp(StringPiece op_name) { return ret; } +/* TODO(rmlarsen): This API is deprecated. Remove it if possible to avoid + * copying the NodeDef. */ std::unique_ptr CreateOpKernel( DeviceType device_type, DeviceBase* device, Allocator* allocator, const NodeDef& node_def, int graph_def_version, Status* status) { + // Look up the Op registered for this op name. + std::shared_ptr props; + status->Update(NodeProperties::CreateFromNodeDef( + node_def, OpRegistry::Global(), &props)); + if (!status->ok()) { + errors::AppendToMessage(status, + " for node: ", FormatNodeDefForError(node_def)); + return nullptr; + } + return CreateOpKernel(device_type, device, allocator, props, + graph_def_version, status); +} + +std::unique_ptr CreateOpKernel( + DeviceType device_type, DeviceBase* device, Allocator* allocator, + const std::shared_ptr& props, int graph_def_version, + Status* status) { OpKernel* kernel = nullptr; - *status = CreateOpKernel(std::move(device_type), device, allocator, nullptr, - node_def, graph_def_version, &kernel); + *status = CreateOpKernel(std::move(device_type), device, allocator, + /*flib=*/nullptr, props, graph_def_version, &kernel); return std::unique_ptr(kernel); } Status CreateOpKernel(DeviceType device_type, DeviceBase* device, Allocator* allocator, FunctionLibraryRuntime* flib, - const NodeDef& node_def, int graph_def_version, - OpKernel** kernel) { + const std::shared_ptr& props, + int graph_def_version, OpKernel** kernel) { return CreateOpKernel(std::move(device_type), device, allocator, flib, - /* resource_mgr= */ nullptr, node_def, - graph_def_version, kernel); + /* resource_mgr= */ nullptr, props, graph_def_version, + kernel); } Status CreateOpKernel(DeviceType device_type, DeviceBase* device, Allocator* allocator, FunctionLibraryRuntime* flib, - ResourceMgr* resource_mgr, const NodeDef& node_def, + ResourceMgr* resource_mgr, + const std::shared_ptr& props, int graph_def_version, OpKernel** kernel) { - VLOG(1) << "Instantiating kernel for node: " << SummarizeNodeDef(node_def); - - // Look up the Op registered for this op name. - const OpDef* op_def = nullptr; - TF_RETURN_IF_ERROR(OpRegistry::Global()->LookUpOpDef(node_def.op(), &op_def)); - - // Validate node_def against OpDef. - TF_RETURN_IF_ERROR(ValidateNodeDef(node_def, *op_def)); - - // Look up kernel registration. - const KernelRegistration* registration; + const NodeDef& node_def = props->node_def; bool was_attr_mismatch; - Status s = FindKernelRegistration(device_type, node_def, ®istration, - &was_attr_mismatch); - if (!s.ok()) { - errors::AppendToMessage(&s, " when instantiating ", node_def.op()); - return s; + const KernelRegistration* registration = nullptr; + Status s; + if (props != nullptr) { + VLOG(1) << "Instantiating kernel for node: " << SummarizeNodeDef(node_def); + + // Validate node_def against OpDef. + TF_RETURN_IF_ERROR(ValidateNodeDef(node_def, *props->op_def)); + + // Look up kernel registration. 
+ s = FindKernelRegistration(device_type, node_def, ®istration, + &was_attr_mismatch); + if (!s.ok()) { + errors::AppendToMessage(&s, " when instantiating ", node_def.op()); + return s; + } } if (registration == nullptr) { s.Update(errors::NotFound("No registered '", node_def.op(), @@ -1599,15 +1629,6 @@ Status CreateOpKernel(DeviceType device_type, DeviceBase* device, return s; } - // Get signature from the OpDef & NodeDef - DataTypeVector inputs; - DataTypeVector outputs; - s.Update(InOutTypesForNode(node_def, *op_def, &inputs, &outputs)); - if (!s.ok()) { - errors::AppendToMessage(&s, " for node: ", FormatNodeDefForError(node_def)); - return s; - } - // We are creating a kernel for an op registered in // OpRegistry::Global(), we consult the kernel registry to decide // the kernel's input and output memory types. @@ -1618,10 +1639,9 @@ Status CreateOpKernel(DeviceType device_type, DeviceBase* device, &output_memory_types)); // Everything needed for OpKernel construction. - OpKernelConstruction context(std::move(device_type), device, allocator, - &node_def, op_def, flib, resource_mgr, inputs, - input_memory_types, outputs, output_memory_types, - graph_def_version, &s); + OpKernelConstruction context(std::move(device_type), device, allocator, flib, + resource_mgr, props, input_memory_types, + output_memory_types, graph_def_version, &s); *kernel = registration->factory->Create(&context); if (!s.ok()) { delete *kernel; diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h index 4f1cc91cd19..e0d9742768a 100644 --- a/tensorflow/core/framework/op_kernel.h +++ b/tensorflow/core/framework/op_kernel.h @@ -31,6 +31,7 @@ limitations under the License. #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/node_properties.h" #include "tensorflow/core/framework/op.h" // TODO(b/62899350): Remove #include "tensorflow/core/framework/rendezvous.h" #include "tensorflow/core/framework/selective_registration.h" @@ -85,19 +86,18 @@ class OpKernel { // expensive initialization in the descendant's constructor. explicit OpKernel(OpKernelConstruction* context); - // Specialized constructor that enables the descendant to provide a different - // `NodeDef` value. For example, this constructor can be used to provide a - // stripped-down `NodeDef` that does not contain the full set of attrs (such - // as tensor values) if the descendant stores them in a different form. - explicit OpKernel(OpKernelConstruction* context, - std::unique_ptr node_def, - bool is_deferred = false); - // Specialized constructor that allows a kernel implementation to mark itself // as a "deferred" op. If true, the executor will provide access to the // `OpKernelContext::inc_num_deferred_ops_function()` and // `OpKernelContext::dec_num_deferred_ops_function()` methods at run-time. - explicit OpKernel(OpKernelConstruction* context, bool is_deferred); + OpKernel(OpKernelConstruction* context, bool is_deferred); + + // Specialized constructor that enables the descendant to provide a custom + // `NodeDef` value. For example, this constructor can be used to provide a + // stripped-down `NodeDef` that does not contain the full set of attrs (such + // as tensor values) if the descendant stores them in a different form. 
+ OpKernel(OpKernelConstruction* context, NodeDef&& custom_def, + bool is_deferred); virtual ~OpKernel(); @@ -170,24 +170,26 @@ class OpKernel { } // Accessors. - const NodeDef& def() const { return *def_; } - const string& name() const; // Same as def().name() + const NodeDef& def() const { return props_->node_def; } + const string& name() const { return props_->node_def.name(); } absl::string_view name_view() const { return name_view_; } - const string& type_string() const; // Same as def().op() + const string& type_string() const { return props_->node_def.op(); } absl::string_view type_string_view() const { return type_string_view_; } - const string& requested_device() const; // Same as def().device() + const string& requested_input(int i) const { + return props_->node_def.input(i); + } + const string& requested_device() const { return props_->node_def.device(); } - int num_inputs() const { return input_types_.size(); } - DataType input_type(int i) const { return input_types_[i]; } - const DataTypeVector& input_types() const { return input_types_; } + int num_inputs() const { return props_->input_types.size(); } + DataType input_type(int i) const { return props_->input_types[i]; } + const DataTypeVector& input_types() const { return props_->input_types; } const MemoryTypeVector& input_memory_types() const { return input_memory_types_; } - const string& requested_input(int i) const; // Same as def().input(i) - int num_outputs() const { return output_types_.size(); } - DataType output_type(int o) const { return output_types_[o]; } - const DataTypeVector& output_types() const { return output_types_; } + int num_outputs() const { return props_->output_types.size(); } + DataType output_type(int o) const { return props_->output_types[o]; } + const DataTypeVector& output_types() const { return props_->output_types; } const MemoryTypeVector& output_memory_types() const { return output_memory_types_; } @@ -209,10 +211,8 @@ class OpKernel { string GetTraceArgument(OpKernelContext* ctx); private: - const std::unique_ptr def_; - const DataTypeVector input_types_; + const std::shared_ptr props_; const MemoryTypeVector input_memory_types_; - const DataTypeVector output_types_; const MemoryTypeVector output_memory_types_; NameRangeMap input_name_map_; NameRangeMap output_name_map_; @@ -284,12 +284,10 @@ class PersistentTensor { class OpKernelConstruction { public: OpKernelConstruction(DeviceType device_type, DeviceBase* device, - Allocator* allocator, const NodeDef* node_def, - const OpDef* op_def, FunctionLibraryRuntime* flib, + Allocator* allocator, FunctionLibraryRuntime* flib, ResourceMgr* resource_mgr, - const DataTypeSlice& input_types, + const std::shared_ptr& props, const MemoryTypeSlice& input_memory_types, - const DataTypeSlice& output_types, const MemoryTypeSlice& output_memory_types, int graph_def_version, Status* status); @@ -330,20 +328,22 @@ class OpKernelConstruction { Tensor** out_tensor); // User-supplied configuration of this operation. - const NodeDef& def() const { return *def_; } + const NodeDef& def() const { return props_->node_def; } // For inspecting the inputs to this operation. 
- int num_inputs() const { return input_types_.size(); } - DataType input_type(int i) const { return input_types_[i]; } - const DataTypeSlice& input_types() const { return input_types_; } + int num_inputs() const { return props_->input_types.size(); } + DataType input_type(int i) const { return props_->input_types[i]; } + const DataTypeSlice& input_types() const { return props_->input_types_slice; } const MemoryTypeSlice& input_memory_types() const { return input_memory_types_; } // For inspecting the outputs expected from this operation. - int num_outputs() const { return output_types_.size(); } - DataType output_type(int i) const { return output_types_[i]; } - const DataTypeSlice& output_types() const { return output_types_; } + int num_outputs() const { return props_->output_types.size(); } + DataType output_type(int i) const { return props_->output_types[i]; } + const DataTypeSlice& output_types() const { + return props_->output_types_slice; + } const MemoryTypeSlice& output_memory_types() const { return output_memory_types_; } @@ -403,19 +403,15 @@ class OpKernelConstruction { const DeviceType device_type_; DeviceBase* const device_; Allocator* allocator_; - const NodeDef* def_; - const OpDef* op_def_; FunctionLibraryRuntime* flib_; ResourceMgr* const resource_mgr_; - DataTypeSlice input_types_; + std::shared_ptr props_; MemoryTypeSlice input_memory_types_; - DataTypeSlice output_types_; MemoryTypeSlice output_memory_types_; const int graph_def_version_; Status* status_; - // Allow op_def_ across from OpKernel, but not from subclasses. - // TODO(irving): Remove protos from this header entirely. + // Allow access from OpKernel ctor. friend class OpKernel; TF_DISALLOW_COPY_AND_ASSIGN(OpKernelConstruction); @@ -1404,15 +1400,23 @@ const Eigen::SyclDevice& OpKernelContext::eigen_device() const; std::unique_ptr CreateOpKernel(DeviceType device_type, DeviceBase* device, Allocator* allocator, - const NodeDef& def, + const NodeDef& node_def, int graph_def_version, Status* status); + +std::unique_ptr CreateOpKernel( + DeviceType device_type, DeviceBase* device, Allocator* allocator, + const std::shared_ptr& props, int graph_def_version, + Status* status); + Status CreateOpKernel(DeviceType device_type, DeviceBase* device, Allocator* allocator, FunctionLibraryRuntime* flib, - const NodeDef& def, int graph_def_version, - OpKernel** kernel); + const std::shared_ptr& props, + int graph_def_version, OpKernel** kernel); + Status CreateOpKernel(DeviceType device_type, DeviceBase* device, Allocator* allocator, FunctionLibraryRuntime* flib, - ResourceMgr* resource_mgr, const NodeDef& def, + ResourceMgr* resource_mgr, + const std::shared_ptr& props, int graph_def_version, OpKernel** kernel); // Returns into 'device_types' the subset of prioritized_types that this diff --git a/tensorflow/core/framework/op_kernel_test.cc b/tensorflow/core/framework/op_kernel_test.cc index ec887a0ad93..40425cf24e0 100644 --- a/tensorflow/core/framework/op_kernel_test.cc +++ b/tensorflow/core/framework/op_kernel_test.cc @@ -28,6 +28,7 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/framework/tensor_util.h" #include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/graph.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/strings/str_util.h" diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc index 6240d0fb1ca..1f8a4d06c7a 100644 --- a/tensorflow/core/graph/graph.cc +++ b/tensorflow/core/graph/graph.cc @@ -19,7 +19,7 @@ limitations under the License. #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/node_def.pb.h" -#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/node_properties.h" #include "tensorflow/core/framework/op_def_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/versions.pb.h" @@ -37,23 +37,7 @@ namespace tensorflow { const int Graph::kControlSlot = -1; -struct NodeProperties { - public: - NodeProperties(const OpDef* op_def, NodeDef node_def, - const DataTypeSlice inputs, const DataTypeSlice outputs) - : op_def(op_def), - node_def(std::move(node_def)), - input_types(inputs.begin(), inputs.end()), - output_types(outputs.begin(), outputs.end()) {} - - const OpDef* op_def; // not owned - NodeDef node_def; - const DataTypeVector input_types; - const DataTypeVector output_types; -}; - // Node - #define REF_CLASS(key, value) \ {key, value}, { "Ref" key, value } @@ -97,7 +81,8 @@ const std::unordered_map& Node::kNodeClassTable = {"StatelessIf", NC_IF}, {"While", NC_WHILE}, {"StatelessWhile", NC_WHILE}, - // Not using the constants defined in FunctionLibraryDefinition for the + // Not using the constants defined in FunctionLibraryDefinition + // for the // 4 ops below because android inference library does not link // tf.function related files. {"_Arg", NC_ARG}, diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h index b33c0319c75..235d944bd60 100644 --- a/tensorflow/core/graph/graph.h +++ b/tensorflow/core/graph/graph.h @@ -43,6 +43,7 @@ limitations under the License. 
#include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/graph/edgeset.h" @@ -67,7 +68,6 @@ class WhileContext; class NeighborIter; // Declared below class NodeIter; // Declared below -struct NodeProperties; // Defined in .cc class Node { public: @@ -229,11 +229,12 @@ class Node { while_ctx_ = while_ctx; } + std::shared_ptr properties() const { return props_; } + private: friend class Graph; Node(); - NodeProperties* properties() const { return props_.get(); } void Initialize(int id, int cost_id, std::shared_ptr props, bool is_function_op); diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc index 5931599c6e2..ccdafdf91c9 100644 --- a/tensorflow/core/kernels/constant_op.cc +++ b/tensorflow/core/kernels/constant_op.cc @@ -47,31 +47,30 @@ namespace tensorflow { namespace { -std::unique_ptr StripTensorDataFromNodeDef( - OpKernelConstruction* ctx) { +NodeDef StripTensorDataFromNodeDef(OpKernelConstruction* ctx) { #ifndef __ANDROID__ DCHECK_EQ(NodeDef::descriptor()->field_count(), 6) << "The NodeDef format has changed, and the attr-stripping code may need " << "to be updated."; #endif const NodeDef& original = ctx->def(); - NodeDef* ret = new NodeDef; - ret->set_name(original.name()); - ret->set_op(original.op()); - ret->set_device(original.device()); + NodeDef ret; + ret.set_name(original.name()); + ret.set_op(original.op()); + ret.set_device(original.device()); // Strip the "value" attr from the returned NodeDef. // NOTE(mrry): The present implementation of `OpKernel::OpKernel()` only uses // attrs that affect the cardinality of list-typed inputs and outputs, so it // is safe to drop other attrs from the NodeDef. 
- AddNodeAttr("dtype", ctx->output_type(0), ret); - MergeDebugInfo(original, ret); - return std::unique_ptr(ret); + AddNodeAttr("dtype", ctx->output_type(0), &ret); + MergeDebugInfo(original, &ret); + return ret; } } // namespace ConstantOp::ConstantOp(OpKernelConstruction* ctx) - : OpKernel(ctx, StripTensorDataFromNodeDef(ctx)), + : OpKernel(ctx, StripTensorDataFromNodeDef(ctx), false), tensor_(ctx->output_type(0)) { const TensorProto* proto = nullptr; MEMDEBUG_CACHE_OP(ctx->def().name().c_str()); diff --git a/tensorflow/core/kernels/data/dataset_test_base.cc b/tensorflow/core/kernels/data/dataset_test_base.cc index 7c5d0c3f679..817e075e69b 100644 --- a/tensorflow/core/kernels/data/dataset_test_base.cc +++ b/tensorflow/core/kernels/data/dataset_test_base.cc @@ -304,9 +304,14 @@ Status DatasetOpsTestBase::ExpectEqual(std::vector produced_tensors, Status DatasetOpsTestBase::CreateOpKernel( const NodeDef& node_def, std::unique_ptr* op_kernel) { OpKernel* kernel; + Status s; + + std::shared_ptr props; + TF_RETURN_IF_ERROR(NodeProperties::CreateFromNodeDef( + node_def, flr_->GetFunctionLibraryDefinition(), &props)); TF_RETURN_IF_ERROR(tensorflow::CreateOpKernel( device_type_, device_.get(), allocator_, flr_, - device_->resource_manager(), node_def, TF_GRAPH_DEF_VERSION, &kernel)); + device_->resource_manager(), props, TF_GRAPH_DEF_VERSION, &kernel)); op_kernel->reset(kernel); return Status::OK(); } @@ -435,9 +440,10 @@ Status DatasetOpsTestBase::RunFunction( LocalExecutorParams params; params.function_library = flr_; params.device = device_.get(); - params.create_kernel = [this, version](const NodeDef& ndef, - OpKernel** kernel) { - return CreateNonCachedKernel(device_.get(), this->flr_, ndef, version, + params.create_kernel = [this, version]( + const std::shared_ptr& props, + OpKernel** kernel) { + return CreateNonCachedKernel(device_.get(), this->flr_, props, version, kernel); }; params.delete_kernel = [](OpKernel* kernel) { diff --git a/tensorflow/core/kernels/data/single_threaded_executor.cc b/tensorflow/core/kernels/data/single_threaded_executor.cc index a6b31679fa6..5393d5557eb 100644 --- a/tensorflow/core/kernels/data/single_threaded_executor.cc +++ b/tensorflow/core/kernels/data/single_threaded_executor.cc @@ -108,7 +108,8 @@ class SingleThreadedExecutorImpl : public Executor { KernelState& kernel_state = kernels_[kernel_index]; node_to_index_map[n] = kernel_index; - TF_RETURN_IF_ERROR(params_.create_kernel(n->def(), &kernel_state.kernel)); + TF_RETURN_IF_ERROR( + params_.create_kernel(n->properties(), &kernel_state.kernel)); kernel_state.num_inputs = n->num_inputs(); kernel_state.num_outputs = n->num_outputs(); diff --git a/tensorflow/core/kernels/data/single_threaded_executor_test.cc b/tensorflow/core/kernels/data/single_threaded_executor_test.cc index 1a5059487a4..898a6555265 100644 --- a/tensorflow/core/kernels/data/single_threaded_executor_test.cc +++ b/tensorflow/core/kernels/data/single_threaded_executor_test.cc @@ -58,11 +58,12 @@ class ExecutorTest : public ::testing::Test { const int version = graph->versions().producer(); LocalExecutorParams params; params.device = device_.get(); - params.create_kernel = [this, version](const NodeDef& ndef, - OpKernel** kernel) { - return CreateNonCachedKernel(device_.get(), nullptr, ndef, version, - kernel); - }; + params.create_kernel = + [this, version](const std::shared_ptr& props, + OpKernel** kernel) { + return CreateNonCachedKernel(device_.get(), nullptr, props, version, + kernel); + }; params.delete_kernel = [](OpKernel* kernel) { 
DeleteNonCachedKernel(kernel); }; diff --git a/tensorflow/python/eager/pywrap_tfe_test.py b/tensorflow/python/eager/pywrap_tfe_test.py index f510f24d777..c2389025a25 100644 --- a/tensorflow/python/eager/pywrap_tfe_test.py +++ b/tensorflow/python/eager/pywrap_tfe_test.py @@ -237,8 +237,7 @@ class Tests(test.TestCase): @test_util.assert_no_garbage_created def testInvalidNumOutputs(self): with self.assertRaisesRegexp( - Exception, - "Value for attr 'num_split' of -1 must be at least minimum 1"): + Exception, r"Value for number_attr\(\) -1 < 0 \[Op:Split\]"): array_ops.split(value=[1, 2, 3], num_or_size_splits=-1) with self.assertRaisesRegexp( From fbdfc9db0125d7a0302f69d866a0c1fcb86521d2 Mon Sep 17 00:00:00 2001 From: Jian Li Date: Tue, 18 Feb 2020 13:07:03 -0800 Subject: [PATCH 163/442] Optimize integer SVDF. The first matmul in integer SVDF is not accumulative (because state is symmetrically quantized) so there is no need to reset the last bit of state. PiperOrigin-RevId: 295803702 Change-Id: Ife85c755e52abda33ea1a0ef90f6b219f5301fda --- .../lite/kernels/internal/reference/svdf.h | 19 ++++++------------- tensorflow/lite/micro/kernels/svdf.cc | 19 ++++++------------- .../micro/kernels/xtensa-hifimini/svdf.cc | 19 ++++++------------- 3 files changed, 18 insertions(+), 39 deletions(-) diff --git a/tensorflow/lite/kernels/internal/reference/svdf.h b/tensorflow/lite/kernels/internal/reference/svdf.h index 02a7e8adf0c..7016e3ab053 100644 --- a/tensorflow/lite/kernels/internal/reference/svdf.h +++ b/tensorflow/lite/kernels/internal/reference/svdf.h @@ -102,19 +102,6 @@ inline void EvalIntegerSVDF( const int n_unit = n_filter / n_rank; const int n_memory = weights_time_tensor->dims->data[1]; - // Rewrite last bit of state. - // TODO(jianlijianli): move this function into matmul. - { - for (int b = 0; b < n_batch; ++b) { - int16_t* state_ptr_batch = - GetTensorData(state_tensor) + b * n_memory * n_filter; - for (int c = 0; c < n_filter; ++c) { - int16_t* state_ptr = state_ptr_batch + c * n_memory; - state_ptr[n_memory - 1] = 0; - } - } - } - // Feature matmul. { int16_t* state = GetTensorData(state_tensor); @@ -135,6 +122,12 @@ inline void EvalIntegerSVDF( dot_prod = MultiplyByQuantizedMultiplier(dot_prod, scale_1_a, scale_1_b); dot_prod = std::min(std::max(output_min, dot_prod), output_max); + // This assumes state is symmetrically quantized. Otherwise last bit of + // state should be initialized to its zero point and accumulate the + // dot_prod. + // Equivalent as the following: + // result_in_batch = zero point, which happens to be zero. + // result_in_batch += dot_prod. *result_in_batch = dot_prod; result_in_batch += n_memory; } diff --git a/tensorflow/lite/micro/kernels/svdf.cc b/tensorflow/lite/micro/kernels/svdf.cc index a4fcd2b7f5e..85f8280d1e1 100644 --- a/tensorflow/lite/micro/kernels/svdf.cc +++ b/tensorflow/lite/micro/kernels/svdf.cc @@ -215,19 +215,6 @@ void EvalIntegerSVDF( int32_t scratch_tensor[kScratchTensorMaxSize]; int32_t scratch_output_tensor[kScratchTensorMaxSize]; - // Rewrite last bit of state. - { - for (int b = 0; b < n_batch; ++b) { - int16_t* state_ptr_batch = - GetTensorData(activation_state_tensor) + - b * n_memory * n_filter; - for (int c = 0; c < n_filter; ++c) { - int16_t* state_ptr = state_ptr_batch + c * n_memory; - state_ptr[n_memory - 1] = 0; - } - } - } - // Feature matmul. 
{ int16_t* state = GetTensorData(activation_state_tensor); @@ -248,6 +235,12 @@ void EvalIntegerSVDF( dot_prod = MultiplyByQuantizedMultiplier(dot_prod, scale_1_a, scale_1_b); dot_prod = std::min(std::max(output_min, dot_prod), output_max); + // This assumes state is symmetrically quantized. Otherwise last bit of + // state should be initialized to its zero point and accumulate the + // dot_prod. + // Equivalent as the following: + // result_in_batch = zero point, which happens to be zero. + // result_in_batch += dot_prod_56. *result_in_batch = dot_prod; result_in_batch += n_memory; } diff --git a/tensorflow/lite/micro/kernels/xtensa-hifimini/svdf.cc b/tensorflow/lite/micro/kernels/xtensa-hifimini/svdf.cc index 80c0c27ea46..d0901e5a2bc 100644 --- a/tensorflow/lite/micro/kernels/xtensa-hifimini/svdf.cc +++ b/tensorflow/lite/micro/kernels/xtensa-hifimini/svdf.cc @@ -75,19 +75,6 @@ void EvalIntegerSVDF( int32_t scratch_tensor[kScratchTensorMaxSize]; int32_t scratch_output_tensor[kScratchTensorMaxSize]; - // Rewrite last bit of state. - { - for (int b = 0; b < n_batch; ++b) { - int16_t* state_ptr_batch = - GetTensorData(activation_state_tensor) + - b * n_memory * n_filter; - for (int c = 0; c < n_filter; ++c) { - int16_t* state_ptr = state_ptr_batch + c * n_memory; - state_ptr[n_memory - 1] = 0; - } - } - } - // Feature matmul. { int16_t* state = GetTensorData(activation_state_tensor); @@ -145,6 +132,12 @@ void EvalIntegerSVDF( dot_prod_56 = AE_MAXQ56S(dot_prod_56, output_int16_min_56); dot_prod_56 = AE_MINQ56S(dot_prod_56, output_int16_max_56); // Truncate immediately since the QR register is already 32 bit aligned: + // This assumes state is symmetrically quantized. Otherwise last bit of + // state should be initialized to its zero point and accumulate the + // dot_prod. + // Equivalent as the following: + // result_in_batch = zero point, which happens to be zero. + // result_in_batch += dot_prod_56. *result_in_batch = AE_TRUNCA32Q48(dot_prod_56); result_in_batch += n_memory; } From ecb8befb326e9fb18dbb5556933c16a4165c42cb Mon Sep 17 00:00:00 2001 From: Jakob Buchgraber Date: Tue, 18 Feb 2020 13:10:12 -0800 Subject: [PATCH 164/442] nccl_configure: introduce environment variable TF_NCCL_CONFIG_REPO TF_NCCL_CONFIG_REPO follows the same pattern as used in the other *_configure rules. If set TF_NCCL_CONFIG_REPO should point to a package with pregenerated configuration files. PiperOrigin-RevId: 295804343 Change-Id: Ie1a69732fc3a538ccc3ed158c8ae79bda280514a --- third_party/nccl/nccl_configure.bzl | 36 +++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/third_party/nccl/nccl_configure.bzl b/third_party/nccl/nccl_configure.bzl index eba838cd98e..4081ec156d5 100644 --- a/third_party/nccl/nccl_configure.bzl +++ b/third_party/nccl/nccl_configure.bzl @@ -63,14 +63,7 @@ alias( def _label(file): return Label("//third_party/nccl:{}".format(file)) -def _nccl_configure_impl(repository_ctx): - """Implementation of the nccl_configure repository rule.""" - if (not enable_cuda(repository_ctx) or - get_cpu_value(repository_ctx) not in ("Linux", "FreeBSD")): - # Add a dummy build file to make bazel query happy. - repository_ctx.file("BUILD", _NCCL_DUMMY_BUILD_CONTENT) - return - +def _create_local_nccl_repository(repository_ctx): # Resolve all labels before doing any real work. Resolving causes the # function to be restarted with all previous state being lost. This # can easily lead to a O(n^2) runtime in the number of labels. 
@@ -120,8 +113,33 @@ def _nccl_configure_impl(repository_ctx): } repository_ctx.template("BUILD", _label("system.BUILD.tpl"), config_wrap) +def _create_remote_nccl_repository(repository_ctx, remote_config_repo): + repository_ctx.template( + "BUILD", + Label(remote_config_repo + ":BUILD"), + {}, + ) + + nccl_version = get_host_environ(repository_ctx, _TF_NCCL_VERSION, "") + if nccl_version == "": + repository_ctx.template( + "build_defs.bzl", + Label(remote_config_repo + ":build_defs.bzl"), + {}, + ) + +def _nccl_autoconf_impl(repository_ctx): + if (not enable_cuda(repository_ctx) or + get_cpu_value(repository_ctx) not in ("Linux", "FreeBSD")): + # Add a dummy build file to make bazel query happy. + repository_ctx.file("BUILD", _NCCL_DUMMY_BUILD_CONTENT) + elif get_host_environ(repository_ctx, "TF_NCCL_CONFIG_REPO") != None: + _create_remote_nccl_repository(repository_ctx, get_host_environ(repository_ctx, "TF_NCCL_CONFIG_REPO")) + else: + _create_local_nccl_repository(repository_ctx) + nccl_configure = repository_rule( - implementation = _nccl_configure_impl, + implementation = _nccl_autoconf_impl, environ = [ _CUDA_TOOLKIT_PATH, _NCCL_HDR_PATH, From bdcb2782c310f70d5fec8c5085e0e1bf1dbe5c2d Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Tue, 18 Feb 2020 13:10:55 -0800 Subject: [PATCH 165/442] Add dependency for the direct TPU driver back into the tpu_driver target PiperOrigin-RevId: 295804517 Change-Id: I264b897ef17ff38d0c2a98dec1e6de49f8283556 --- tensorflow/compiler/xla/python/tpu_driver/client/BUILD | 1 + tensorflow/compiler/xla/python/tpu_driver/direct_tpu_driver.cc | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/BUILD b/tensorflow/compiler/xla/python/tpu_driver/client/BUILD index 5237ce3ab7a..148822f3ba7 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/BUILD +++ b/tensorflow/compiler/xla/python/tpu_driver/client/BUILD @@ -22,6 +22,7 @@ cc_library( "//tensorflow/compiler/xla/python:local_client", "//tensorflow/compiler/xla/python:semaphore", "//tensorflow/compiler/xla/python/tpu_driver", + "//tensorflow/compiler/xla/python/tpu_driver:direct_tpu_driver", "//tensorflow/compiler/xla/python/tpu_driver:grpc_tpu_driver", "//tensorflow/compiler/xla/python/tpu_driver:recording_tpu_driver", "//tensorflow/compiler/xla/python/tpu_driver:tpu_driver_proto_cc", diff --git a/tensorflow/compiler/xla/python/tpu_driver/direct_tpu_driver.cc b/tensorflow/compiler/xla/python/tpu_driver/direct_tpu_driver.cc index 3e4626c5841..76d79786bbf 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/direct_tpu_driver.cc +++ b/tensorflow/compiler/xla/python/tpu_driver/direct_tpu_driver.cc @@ -27,7 +27,8 @@ namespace tpu_driver { namespace { -// Enable the macro by default in the env where the libtpu.so is available. +// Enable the macro by default in the Google internal environment where the +// libtpu.so is linked in statically. #ifdef PLATFORM_GOOGLE #define TPU_SHARED_LIBRARY_COMPILE_LINK 1 #endif From 2c452720ee0595f10a639a24e70975f0ed8f805a Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Tue, 18 Feb 2020 13:28:42 -0800 Subject: [PATCH 166/442] Remove the minSdk number. 
PiperOrigin-RevId: 295808325 Change-Id: Ic396e3d788bcaeae0acc3fdd5d64867e750bba01 --- tensorflow/lite/java/AndroidManifest.xml | 4 +++- .../java/src/main/java/org/tensorflow/lite/Interpreter.java | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/java/AndroidManifest.xml b/tensorflow/lite/java/AndroidManifest.xml index bacf6d7a126..579021f3b3c 100644 --- a/tensorflow/lite/java/AndroidManifest.xml +++ b/tensorflow/lite/java/AndroidManifest.xml @@ -2,8 +2,10 @@ + + android:targetSdkVersion="19" /> diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java index 258d320738b..6aeb06355b4 100644 --- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java +++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java @@ -74,7 +74,8 @@ import org.checkerframework.checker.nullness.qual.NonNull; *
<p><b>WARNING:</b>Instances of a {@code Interpreter} is not thread-safe. A {@code
 * Interpreter} owns resources that must be explicitly freed by invoking {@link #close()}
 *
- * <p>The minimum Android API Level ({@code minSdkVersion}) required for this library is 19.
+ * <p>
The TFLite library is built against NDK API 19. It may work for Android API levels below 19, + * but is not guaranteed. */ public final class Interpreter implements AutoCloseable { From 8f3272028b674ad08c80ae1e0f31d7ce56f8295e Mon Sep 17 00:00:00 2001 From: Bruce Fontaine Date: Tue, 18 Feb 2020 13:35:16 -0800 Subject: [PATCH 167/442] Update static shape detection to be static batch size detection for sparse or ragged tensors. This is needed as when they are batched by a dataset they will typically have a shape like (batch_size, None). PiperOrigin-RevId: 295809971 Change-Id: I64d2fed27e0766c8857141bc28c581086155f77e --- tensorflow/python/distribute/input_lib.py | 31 +++++++- .../python/distribute/input_lib_test.py | 73 +++++++++++++++++-- 2 files changed, 95 insertions(+), 9 deletions(-) diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py index aa02323c75e..163f775cc93 100644 --- a/tensorflow/python/distribute/input_lib.py +++ b/tensorflow/python/distribute/input_lib.py @@ -202,6 +202,30 @@ def _get_next_as_optional(iterator, strategy, name=None): return global_has_value, replicas +def _is_statically_shaped(tensor_class, shape): + """Test if an iteratort output is statically shaped. + + For sparse and ragged tensors this only tests the batch dimension. + + Args: + tensor_class: a class from an iterator.output_classes list. + shape: a TensorShape from an iterator.output_shapes list. + + Returns: + True if the shape is static, false otherwise. + """ + if (tensor_class == sparse_tensor.SparseTensor or + isinstance(tensor_class, ragged_tensor.RaggedTensorSpec)): + # For sparse or ragged tensor, we should only check the first + # dimension in order to get_next_as_optional. This is because + # when these tensors get batched by dataset only the batch dimension + # is set. 
+ if shape.rank > 0 and shape.as_list()[0] is None: + return False + return True + return shape.is_fully_defined() + + class DistributedIterator(object): """Common implementation for all input iterators.""" @@ -210,9 +234,10 @@ class DistributedIterator(object): for iterator in iterators: if not isinstance(iterator, _SingleWorkerDatasetIterator): continue - flattened_shapes = nest.flatten(iterator.output_shapes) - for output_shape in flattened_shapes: - if not output_shape.is_fully_defined(): + flattened = zip(nest.flatten(iterator.output_shapes), + nest.flatten(iterator.output_classes)) + for output_shape, output_class in flattened: + if not _is_statically_shaped(output_class, output_shape): static_shape = False break diff --git a/tensorflow/python/distribute/input_lib_test.py b/tensorflow/python/distribute/input_lib_test.py index 3c59d0f5e43..80d5db38403 100644 --- a/tensorflow/python/distribute/input_lib_test.py +++ b/tensorflow/python/distribute/input_lib_test.py @@ -525,13 +525,16 @@ class DistributedIteratorTensorTypeTest(DistributedIteratorTestBase, ], input_type=["dataset", "input_fn"], drop_remainder=[False, True], - defun=[lambda f: f, def_function.function], + defun_type=["lambda", "tf_function"], )) - def testRaggedSparse(self, distribution, input_type, drop_remainder, defun): + def testRaggedSparse(self, distribution, input_type, drop_remainder, + defun_type): """Test with `RaggedTensor`s and `SparseTensor`s.""" if not tf2.enabled(): self.skipTest("Only V2 is supported.") + defun = {"lambda": lambda f: f, + "tf_function": def_function.function}[defun_type] distribution.extended.experimental_enable_get_next_as_optional = True global_batch_size = 8 @@ -609,14 +612,72 @@ class DistributedIteratorTensorTypeTest(DistributedIteratorTestBase, except (StopIteration, errors.OutOfRangeError): return sums - sums = sum_while_loop( + while_sums = sum_while_loop( iter(dataset), defun(lambda state, iterator: _reduce(state, next(iterator)))) - self.assertDictEqual(sums, defun(sum_for_loop)(dataset)) self.assertAllEqual( - nest.flatten(sums), + nest.flatten(while_sums), # When there's no partial batch, the sum is smaller. - [200. if input_type == "dataset" and drop_remainder else 310.] * 3) + [200. if drop_remainder else 310.] * 3) + for_sums = defun(sum_for_loop)(dataset) + # For loops always call get next as optional inside tf functions, so we + # expect 310 here when using an input function (as there are 5 batches of + # size 4 round robined over 2 replicas. + expected_for_sum = 200. + if (not drop_remainder or ( + defun_type == "tf_function" and input_type == "input_fn")): + expected_for_sum = 310. 
+ self.assertAllEqual(nest.flatten(for_sums), [expected_for_sum] * 3) + + @combinations.generate( + combinations.combine( + mode=["eager"], + distribution=[ + strategy_combinations.mirrored_strategy_with_gpu_and_cpu, + strategy_combinations.central_storage_strategy_with_gpu_and_cpu, + strategy_combinations.one_device_strategy, + strategy_combinations.mirrored_strategy_with_one_cpu + ], + input_type=["dataset", "input_fn"], + drop_remainder=[False, True], + tensor_type=["sparse", "ragged"], + enable_get_next_as_optional=[True, False] + )) + def testRaggedSparseGetNextAsOptional( + self, distribution, input_type, drop_remainder, tensor_type, + enable_get_next_as_optional): + """Test with `RaggedTensor`s and `SparseTensor`s.""" + if not tf2.enabled(): + self.skipTest("Only V2 is supported.") + + distribution.extended.experimental_enable_get_next_as_optional = ( + enable_get_next_as_optional) + global_batch_size = 8 + + def dataset_fn(ctx=None): + ctx = ctx or distribute_lib.InputContext() + batch_size = ctx.get_per_replica_batch_size(global_batch_size) + # Use 20 which isn't divisible by 8 to test partial batch behavior. + row_lengths = np.mod(np.arange(20), 4).astype(np.int64) + ragged_tensor = ragged_tensor_lib.RaggedTensor.from_row_lengths( + np.repeat(np.arange(20, dtype=np.float32), row_lengths), row_lengths) + dataset = dataset_ops.DatasetV2.from_tensor_slices({ + tensor_type: (ragged_tensor if tensor_type == "ragged" else + ragged_tensor.to_sparse()), + }) + dataset = dataset.shard(ctx.num_input_pipelines, ctx.input_pipeline_id) + return dataset.batch(batch_size, drop_remainder=drop_remainder) + + if input_type == "dataset": + ds = distribution.experimental_distribute_dataset( + dataset_fn(distribute_lib.InputContext())) + else: + ds = distribution.experimental_distribute_datasets_from_function( + dataset_fn) + iterator = iter(ds) + + self.assertEqual(iterator._enable_get_next_as_optional, + (not drop_remainder) and enable_get_next_as_optional) class DistributedIteratorMultiWorkerTest( From ac2c05a1d57398653057405018a8c1e51e99756a Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Tue, 18 Feb 2020 13:39:31 -0800 Subject: [PATCH 168/442] [TF/XLA] Fix several layout issues. 1. The previous approach might have different layouts for computation.GetProgramShape() and xla_output_shape. It only used shape_representation_fn for xla_output_shape, but not entry's program shape. These being different are often confusing, and may make it hard to reproduce a bug with HLO dump which doesn't have HloModuleConfig. 2. Output shapes were not updated with layout when there is sharding. 3. The updated value of a resource did not preserve the fast_mem annotation on the argument. 
PiperOrigin-RevId: 295811071 Change-Id: I801a46d3039b2349dd0196cbc14ec3d9a8211d55 --- tensorflow/compiler/tf2xla/type_util.cc | 1 + tensorflow/compiler/tf2xla/xla_compiler.cc | 213 +++++++++--------- .../compiler/tf2xla/xla_compiler_test.cc | 9 +- tensorflow/compiler/xla/client/xla_builder.cc | 20 +- tensorflow/compiler/xla/client/xla_builder.h | 12 +- 5 files changed, 145 insertions(+), 110 deletions(-) diff --git a/tensorflow/compiler/tf2xla/type_util.cc b/tensorflow/compiler/tf2xla/type_util.cc index 634f64e01e6..2266a07463d 100644 --- a/tensorflow/compiler/tf2xla/type_util.cc +++ b/tensorflow/compiler/tf2xla/type_util.cc @@ -97,6 +97,7 @@ xla::StatusOr EncodePrimitiveTypeAsDataType(xla::PrimitiveType type) { {xla::U16, DT_UINT16}, {xla::U32, DT_UINT32}, {xla::U64, DT_UINT64}, + {xla::C128, DT_COMPLEX128}, }); auto it = data_type_map.find(type); diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index 8e44d3d4255..3ea62882dcb 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -139,6 +139,86 @@ Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr graph, return Status::OK(); } +// Rewrites the layout of xla_shape if there is tiled sharding. +Status RewriteLayoutWithShardedShape( + const absl::optional& sharding, bool use_fast_memory, + XlaCompiler::ShapeRepresentationFn shape_representation_fn, + xla::Shape* xla_shape) { + if (sharding && !sharding->IsTileMaximal()) { + // After sharding, per core shape might have different layout. For example, + // before sharding, a shape [128, 128] will be assigned default + // minor-to-major {1, 0}. But after we shard this shape to [128, 64] * 2, + // the sharded shapes will have minor-to-major {0, 1}. + // + // As a result, for sharded shapes, we set their layout to per core shape's + // layout. + // + // TODO(endlessroad): for variable input & update, we might have + // different layouts which will prevent input output aliasing and + // increase memory usage. Investigate such cases. + int64 device = *sharding->tile_assignment().begin(); + std::vector offset = + sharding->TileOffsetForDevice(*xla_shape, device); + std::vector limit = sharding->TileLimitForDevice(*xla_shape, device); + std::vector dimensions(xla_shape->rank()); + for (int64 i = 0; i < xla_shape->rank(); ++i) { + dimensions[i] = limit[i] - offset[i]; + } + xla::Shape per_device_xla_shape = + xla::ShapeUtil::MakeShape(xla_shape->element_type(), dimensions); + TensorShape per_device_tensor_shape; + TF_RETURN_IF_ERROR( + XLAShapeToTensorShape(per_device_xla_shape, &per_device_tensor_shape)); + TF_ASSIGN_OR_RETURN(DataType dtype, EncodePrimitiveTypeAsDataType( + xla_shape->element_type())); + TF_ASSIGN_OR_RETURN(per_device_xla_shape, + shape_representation_fn(per_device_tensor_shape, dtype, + use_fast_memory)); + *xla_shape->mutable_layout() = per_device_xla_shape.layout(); + } + return Status::OK(); +} + +// There is a shape_representation_fn or sharding for an output, this function +// uses a reshape to fix the layout. +xla::StatusOr ReshapeWithCorrectRepresentationAndSharding( + xla::XlaBuilder* builder, xla::XlaOp original, xla::Shape original_shape, + XlaCompiler::ShapeRepresentationFn shape_representation_fn, + absl::optional sharding, bool fast_mem) { + if (original_shape.IsTuple()) { + std::vector elements; + for (int64 i = 0; i < original_shape.tuple_shapes_size(); ++i) { + auto subsharding = sharding ? 
sharding->tuple_shardings(i) : sharding; + TF_ASSIGN_OR_RETURN(auto element, + ReshapeWithCorrectRepresentationAndSharding( + builder, xla::GetTupleElement(original, i), + original_shape.tuple_shapes(i), + shape_representation_fn, subsharding, fast_mem)); + elements.push_back(element); + } + return xla::Tuple(builder, elements); + } + if (!original_shape.IsArray()) return original; + TensorShape shape; + TF_RETURN_IF_ERROR(XLAShapeToTensorShape(original_shape, &shape)); + TF_ASSIGN_OR_RETURN(DataType dtype, EncodePrimitiveTypeAsDataType( + original_shape.element_type())); + TF_ASSIGN_OR_RETURN(auto to_shape, + shape_representation_fn(shape, dtype, fast_mem)); + if (sharding) { + TF_ASSIGN_OR_RETURN(auto hlo_sharding, + xla::HloSharding::FromProto(*sharding)); + TF_RETURN_IF_ERROR(RewriteLayoutWithShardedShape( + hlo_sharding, fast_mem, shape_representation_fn, &to_shape)); + } + if (xla::ShapeUtil::Compatible(original_shape, to_shape)) { + for (int64 i = 0; i < original_shape.rank(); ++i) { + to_shape.set_dynamic_dimension(i, original_shape.is_dynamic_dimension(i)); + } + } + return xla::Reshape(to_shape, original); +} + // Builds the XLA computation. // - `args` is the list of input arguments // - `retvals` is the list of retvals produced by _Retval operators, in index @@ -188,10 +268,6 @@ Status BuildComputation( std::vector elems; elems.reserve(retvals.size()); - // Keeps track of the layout of each retval. If a retval is not in this list, - // a descending layout is used. The first element is the output index, second - // element is the new layout. - std::vector> retval_index_and_layout; // Keeps track of sharding of each retval. If a retval is not in this list, // replicate sharding is used. The first element is the output index, second // element is the sharding. @@ -219,22 +295,22 @@ Status BuildComputation( TF_ASSIGN_OR_RETURN(output.shape, retval.GetShape()); xla::XlaOp value = retval.handle(); auto it = retval_shardings.find(i); - xla::XlaScopedShardingAssignment assign_sharding( - builder, it == retval_shardings.end() - ? absl::optional() - : it->second); + absl::optional sharding = + it == retval_shardings.end() ? absl::optional() + : it->second; if (it != retval_shardings.end()) { retval_index_and_sharding[elems.size()] = it->second; } if (shape_representation_fn) { - // If there is a shape representation function, reshape the output - // tensor to the shape given by the representation shape function. - TF_ASSIGN_OR_RETURN(xla::Shape shape, shape_representation_fn( - output.shape, output.type, - /*use_fast_memory=*/false)); - value = xla::Reshape(value, xla::AsInt64Slice(shape.dimensions())); - retval_index_and_layout.emplace_back(elems.size(), shape.layout()); - } else if (it != retval_shardings.end()) { + TF_ASSIGN_OR_RETURN(auto original_shape, builder->GetShape(value)); + TF_ASSIGN_OR_RETURN(value, + ReshapeWithCorrectRepresentationAndSharding( + builder, value, original_shape, + shape_representation_fn, sharding, + /*fast_mem=*/false)); + } + if (it != retval_shardings.end()) { + xla::XlaScopedShardingAssignment assign_sharding(builder, sharding); // Apply the sharding to the output, if there is a core assignment. value = identity_op(value); } @@ -312,43 +388,27 @@ Status BuildComputation( update.tensor_array_gradients_accessed.insert(grad.first); } + xla::XlaOp handle; + TF_RETURN_IF_ERROR(resource->Pack(&handle, builder)); + auto sharding = it == arg_shardings.end() + ? absl::optional() + : it->second; + // Set layout of the retval to device representation layout. 
+ if (shape_representation_fn) { + TF_ASSIGN_OR_RETURN(auto original_shape, builder->GetShape(handle)); + TF_ASSIGN_OR_RETURN( + handle, ReshapeWithCorrectRepresentationAndSharding( + builder, handle, original_shape, + shape_representation_fn, sharding, arg.fast_mem)); + } + // Request that the value be returned on a specific core. - xla::XlaScopedShardingAssignment assign_sharding( - builder, it == arg_shardings.end() ? absl::optional() - : it->second); + xla::XlaScopedShardingAssignment assign_sharding(builder, sharding); if (it != arg_shardings.end()) { retval_index_and_sharding[elems.size()] = it->second; } - - xla::XlaOp handle; - TF_RETURN_IF_ERROR(resource->Pack(&handle, builder)); - // Ensures the correct sharding is applied to the output. handle = identity_op(handle); - - // Set layout of the retval to device representation layout. - absl::optional representation_shape; - if (shape_representation_fn) { - TF_ASSIGN_OR_RETURN( - xla::Shape xla_shape, - shape_representation_fn(resource->shape(), resource->type(), - /*use_fast_memory=*/false)); - representation_shape = xla_shape; - } - if (resource->representation_shape().has_value()) { - const xla::Shape& xla_shape = resource->representation_shape().value(); - if (representation_shape) { - TF_RET_CHECK( - xla::ShapeUtil::Compatible(*representation_shape, xla_shape)); - } else { - representation_shape = xla_shape; - } - } - if (representation_shape) { - retval_index_and_layout.emplace_back(elems.size(), - representation_shape->layout()); - } - elems.push_back(handle); } } @@ -411,20 +471,8 @@ Status BuildComputation( } *computation = computation_status.ConsumeValueOrDie(); - TF_ASSIGN_OR_RETURN(const auto& program_shape, - computation->GetProgramShape()); + TF_ASSIGN_OR_RETURN(auto program_shape, computation->GetProgramShape()); *output_shape = program_shape.result(); - // Update the output layout to the layout of retval. - for (auto& index_and_layout : retval_index_and_layout) { - if (!always_return_tuple && elems.size() == 1) { - *output_shape->mutable_layout() = index_and_layout.second; - continue; - } - - xla::Shape* output_sub_shape = xla::ShapeUtil::GetMutableSubshape( - output_shape, {index_and_layout.first}); - *output_sub_shape->mutable_layout() = index_and_layout.second; - } return Status::OK(); } @@ -779,47 +827,6 @@ Status XlaCompiler::XLAShapeForArgument( const XlaCompiler::Argument& arg, bool is_entry_computation, const absl::optional& arg_sharding, xla::Shape* xla_shape) const { - auto rewrite_layout_with_sharded_shape = - [](const absl::optional& arg_sharding, - bool use_fast_memory, - XlaCompiler::ShapeRepresentationFn shape_representation_fn, - xla::Shape* xla_shape) { - if (arg_sharding && !arg_sharding->IsTileMaximal()) { - // After parameter sharding, per core parameter might have different - // layout. For example, before sharding, a parameter of shape [128, - // 128] will be assigned default minor-to-major {1, 0}. But after we - // shard this parameter to [128, 64] * 2, the sharded parameters - // will have minor-to-major {0, 1}. - // - // As a result, for sharded parameters, we set their layout to per - // core parameter's layout. - // - // TODO(endlessroad): for variable input & update, we might have - // different layouts which will prevent input output aliasing and - // increase memory usage. Investigate such cases. 
- int64 device = *arg_sharding->tile_assignment().begin(); - std::vector offset = - arg_sharding->TileOffsetForDevice(*xla_shape, device); - std::vector limit = - arg_sharding->TileLimitForDevice(*xla_shape, device); - std::vector dimensions(xla_shape->rank()); - for (int64 i = 0; i < xla_shape->rank(); ++i) { - dimensions[i] = limit[i] - offset[i]; - } - xla::Shape per_device_xla_shape = - xla::ShapeUtil::MakeShape(xla_shape->element_type(), dimensions); - TensorShape per_device_tensor_shape; - TF_RETURN_IF_ERROR(XLAShapeToTensorShape(per_device_xla_shape, - &per_device_tensor_shape)); - TF_ASSIGN_OR_RETURN(DataType dtype, EncodePrimitiveTypeAsDataType( - xla_shape->element_type())); - TF_ASSIGN_OR_RETURN(per_device_xla_shape, - shape_representation_fn(per_device_tensor_shape, - dtype, use_fast_memory)); - *xla_shape->mutable_layout() = per_device_xla_shape.layout(); - } - return Status::OK(); - }; switch (arg.kind) { case XlaCompiler::Argument::kConstant: LOG(FATAL) << "Unreachable case"; @@ -835,7 +842,7 @@ Status XlaCompiler::XLAShapeForArgument( TF_ASSIGN_OR_RETURN(*xla_shape, options_.shape_representation_fn( shape, arg.type, /*use_fast_memory=*/false)); - TF_RETURN_IF_ERROR(rewrite_layout_with_sharded_shape( + TF_RETURN_IF_ERROR(RewriteLayoutWithShardedShape( arg_sharding, /*use_fast_memory=*/false, options_.shape_representation_fn, xla_shape)); } else { @@ -863,7 +870,7 @@ Status XlaCompiler::XLAShapeForArgument( options_.shape_representation_fn( absl::get(arg.shape), arg.type, /*use_fast_memory=*/arg.fast_mem)); - TF_RETURN_IF_ERROR(rewrite_layout_with_sharded_shape( + TF_RETURN_IF_ERROR(RewriteLayoutWithShardedShape( arg_sharding, arg.fast_mem, options_.shape_representation_fn, xla_shape)); return Status::OK(); diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc index cf8bd6b6ce4..76780167187 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc @@ -365,7 +365,8 @@ TEST_F(XlaCompilerTest, HonorShapeRepresentationFnForFastMemVar) { compile_options.return_updated_values_for_all_resources = true; TF_ASSERT_OK(compiler.CompileGraph(compile_options, "add", std::move(graph), args, &result)); - EXPECT_EQ(fast_mem_arg_count, 1); + // Count 2: one for argument, one for the return value. + EXPECT_EQ(fast_mem_arg_count, 2); } // Tests that the compiler can correctly propagate the layout assigned by @@ -417,6 +418,8 @@ TEST_F(XlaCompilerTest, HonorShapeRepresentationFnForRetVal) { // Check that the return shapes are correctly tranposed. 
EXPECT_EQ(result.xla_output_shape, xla::ShapeUtil::MakeTupleShape({transposed, transposed})); + EXPECT_EQ(result.computation->GetProgramShape().ConsumeValueOrDie().result(), + xla::ShapeUtil::MakeTupleShape({transposed, transposed})); } // The layout of resource variable shouldn't change after transpose @@ -1091,6 +1094,8 @@ TEST_F(XlaCompilerTest, ResultLayoutSingle) { EXPECT_TRUE(xla::ShapeUtil::Equal( result.xla_output_shape, xla::ShapeUtil::MakeShapeWithLayout(xla::S32, {2, 3}, {0, 1}))); + EXPECT_EQ(result.computation->GetProgramShape().ConsumeValueOrDie().result(), + result.xla_output_shape); } TEST_F(XlaCompilerTest, ResultLayoutMultiple) { @@ -1131,6 +1136,8 @@ TEST_F(XlaCompilerTest, ResultLayoutMultiple) { EXPECT_TRUE(xla::ShapeUtil::Equal( result.xla_output_shape, xla::ShapeUtil::MakeTupleShape({result_shape, result_shape}))); + EXPECT_EQ(result.computation->GetProgramShape().ConsumeValueOrDie().result(), + result.xla_output_shape); } // Tests a simple graph that reads and writes a variable. diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index a7e761b7dd0..d4a267d4356 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -528,7 +528,8 @@ StatusOr XlaBuilder::AddBroadcastSequence(const Shape& output_shape, } // Eliminate the size one dimensions. - TF_ASSIGN_OR_RETURN(XlaOp reshaped_operand, Reshape(reshaped_shape, operand)); + TF_ASSIGN_OR_RETURN(XlaOp reshaped_operand, + ReshapeInternal(reshaped_shape, operand)); // Broadcast 'reshape' up to the larger size. return InDimBroadcast(broadcast_shape, reshaped_operand, broadcast_dimensions); @@ -828,8 +829,8 @@ XlaOp XlaBuilder::BroadcastInDim( }); } -StatusOr XlaBuilder::Reshape(const Shape& shape, XlaOp operand, - int64 inferred_dimension) { +StatusOr XlaBuilder::ReshapeInternal(const Shape& shape, XlaOp operand, + int64 inferred_dimension) { TF_RETURN_IF_ERROR(first_error_); HloInstructionProto instr; @@ -1020,7 +1021,7 @@ XlaOp XlaBuilder::Reshape(XlaOp operand, absl::Span dimensions, XlaOp transposed = IsIdentityPermutation(dimensions) ? 
operand : Transpose(operand, dimensions); - return Reshape(shape, transposed, inferred_dimension); + return ReshapeInternal(shape, transposed, inferred_dimension); }); } @@ -1034,6 +1035,13 @@ XlaOp XlaBuilder::Reshape(XlaOp operand, absl::Span new_sizes, }); } +XlaOp XlaBuilder::Reshape(const Shape& shape, XlaOp operand, + int64 inferred_dimension) { + return ReportErrorOrReturn([&]() -> StatusOr { + return ReshapeInternal(shape, operand, inferred_dimension); + }); +} + XlaOp XlaBuilder::Collapse(XlaOp operand, absl::Span dimensions) { return ReportErrorOrReturn([&]() -> StatusOr { if (dimensions.size() <= 1) { @@ -2951,6 +2959,10 @@ XlaOp Reshape(const XlaOp operand, absl::Span new_sizes) { return operand.builder()->Reshape(operand, new_sizes); } +XlaOp Reshape(const Shape& shape, XlaOp operand) { + return operand.builder()->Reshape(shape, operand); +} + XlaOp ReshapeWithInferredDimension(XlaOp operand, absl::Span new_sizes, int64 inferred_dimension) { diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index 993394ea275..6ec9aeb809f 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -397,6 +397,9 @@ class XlaBuilder { XlaOp Reshape(XlaOp operand, absl::Span new_sizes, int64 inferred_dimension = -1); + XlaOp Reshape(const Shape& shape, XlaOp operand, + int64 inferred_dimension = -1); + XlaOp Collapse(XlaOp operand, absl::Span dimensions); XlaOp Slice(XlaOp operand, absl::Span start_indices, @@ -668,8 +671,8 @@ class XlaBuilder { // Internal helper method for creating a Reshape op with the already inferred // shape. - StatusOr Reshape(const Shape& shape, XlaOp operand, - int64 inferred_dimension = -1); + StatusOr ReshapeInternal(const Shape& shape, XlaOp operand, + int64 inferred_dimension = -1); // Returns the (inferred) result for the program shape using the given root. StatusOr GetProgramShape(int64 root_id) const; @@ -777,6 +780,8 @@ class XlaBuilder { friend XlaOp Reshape(XlaOp operand, absl::Span new_sizes); + friend XlaOp Reshape(const Shape& shape, XlaOp operand); + friend XlaOp ReshapeWithInferredDimension(XlaOp operand, absl::Span new_sizes, int64 inferred_dimension); @@ -1252,6 +1257,9 @@ XlaOp Reshape(XlaOp operand, absl::Span dimensions, // sizes. Conceptually, this is a limited form of "shape casting". XlaOp Reshape(XlaOp operand, absl::Span new_sizes); +// Enqueues a Reshape op that uses an explicit target shape. +XlaOp Reshape(const Shape& shape, XlaOp operand); + // `inferred_dimension` represents the output dimension that's inferred by // upper-level framework by dividing the input element count by the known // output element count. While an inferred_dimension can be static, if there From 0488a18af4ba1f630d06b685a301f6d94622aad4 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Tue, 18 Feb 2020 13:41:56 -0800 Subject: [PATCH 169/442] Automated rollback of commit 80acd88cd43f09a1a2980792e3955f2ce5147bfd PiperOrigin-RevId: 295811620 Change-Id: I39a1f7f7dadee2c7ea231df16da1ab5516c8f1fa --- .../core/kernels/fused_batch_norm_op.cc | 290 ++++++++++++------ .../core/kernels/fused_batch_norm_op_test.cc | 63 ++++ tensorflow/core/ops/nn_ops.cc | 4 + .../api/golden/v1/tensorflow.raw_ops.pbtxt | 6 +- .../api/golden/v2/tensorflow.raw_ops.pbtxt | 6 +- 5 files changed, 264 insertions(+), 105 deletions(-) diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cc b/tensorflow/core/kernels/fused_batch_norm_op.cc index cc0ce9b7922..afe3e621fcf 100644 --- a/tensorflow/core/kernels/fused_batch_norm_op.cc +++ b/tensorflow/core/kernels/fused_batch_norm_op.cc @@ -81,7 +81,7 @@ Status ParseActivationMode(OpKernelConstruction* context, } // Functor used by FusedBatchNormOp to do the computations. -template +template struct FusedBatchNorm; // Functor used by FusedBatchNormGradOp to do the computations when // is_training=True. @@ -89,17 +89,155 @@ template struct FusedBatchNormGrad; template -struct FusedBatchNorm { +struct FusedBatchNorm { + void operator()(OpKernelContext* context, const Tensor& x_input, + const Tensor& scale_input, const Tensor& offset_input, + const Tensor& running_mean_input, + const Tensor& running_variance_input, + const Tensor* side_input, U epsilon, U exponential_avg_factor, + FusedBatchNormActivationMode activation_mode, + Tensor* y_output, Tensor* running_mean_output, + Tensor* running_var_output, Tensor* saved_batch_mean_output, + Tensor* saved_batch_var_output, TensorFormat tensor_format, + bool use_reserved_space) { + OP_REQUIRES(context, side_input == nullptr, + errors::Internal( + "The CPU implementation of FusedBatchNorm does not support " + "side input.")); + OP_REQUIRES(context, + activation_mode == FusedBatchNormActivationMode::kIdentity, + errors::Internal("The CPU implementation of FusedBatchNorm " + "does not support activations.")); + + if (use_reserved_space) { + Tensor* dummy_reserve_space = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(5, {}, &dummy_reserve_space)); + // Initialize the memory, to avoid sanitizer alerts. 
+ dummy_reserve_space->flat()(0) = U(); + } + Tensor transformed_x; + Tensor transformed_y; + if (tensor_format == FORMAT_NCHW) { + const int64 in_batch = GetTensorDim(x_input, tensor_format, 'N'); + const int64 in_rows = GetTensorDim(x_input, tensor_format, 'H'); + const int64 in_cols = GetTensorDim(x_input, tensor_format, 'W'); + const int64 in_depths = GetTensorDim(x_input, tensor_format, 'C'); + OP_REQUIRES_OK(context, context->allocate_temp( + DataTypeToEnum::value, + ShapeFromFormat(FORMAT_NHWC, in_batch, + in_rows, in_cols, in_depths), + &transformed_x)); + OP_REQUIRES_OK(context, context->allocate_temp( + DataTypeToEnum::value, + ShapeFromFormat(FORMAT_NHWC, in_batch, + in_rows, in_cols, in_depths), + &transformed_y)); + // Perform NCHW to NHWC + std::vector perm = {0, 2, 3, 1}; + OP_REQUIRES_OK( + context, ::tensorflow::DoTranspose(context->eigen_device(), + x_input, perm, &transformed_x)); + } else { + transformed_x = x_input; + transformed_y = *y_output; + } + typename TTypes::Tensor x(transformed_x.tensor()); + typename TTypes::ConstVec scale(scale_input.vec()); + typename TTypes::ConstVec offset(offset_input.vec()); + typename TTypes::ConstVec old_mean(running_mean_input.vec()); + typename TTypes::ConstVec old_variance(running_variance_input.vec()); + typename TTypes::Tensor y(transformed_y.tensor()); + typename TTypes::Vec new_mean(running_mean_output->vec()); + typename TTypes::Vec new_variance(running_var_output->vec()); + typename TTypes::Vec saved_batch_mean(saved_batch_mean_output->vec()); + typename TTypes::Vec saved_batch_var(saved_batch_var_output->vec()); + + const CPUDevice& d = context->eigen_device(); + + const int depth = x.dimension(3); + const int size = x.size(); + const int rest_size = size / depth; + Eigen::DSizes rest_by_depth(rest_size, depth); + +#if !defined(EIGEN_HAS_INDEX_LIST) + Eigen::DSizes one_by_depth(1, depth); + Eigen::array reduce_dims({0}); + Eigen::array bcast_spec({rest_size, 1}); +#else + Eigen::IndexList, Eigen::Index> one_by_depth; + one_by_depth.set(1, depth); + Eigen::IndexList> reduce_dims; + Eigen::IndexList> bcast_spec; + bcast_spec.set(0, rest_size); +#endif + + auto x_rest_by_depth = x.reshape(rest_by_depth).template cast(); + const int rest_size_minus_one = (rest_size > 1) ? 
(rest_size - 1) : 1; + U rest_size_inv = static_cast(1.0f / static_cast(rest_size)); + // This adjustment is for Bessel's correction + U rest_size_adjust = + static_cast(rest_size) / static_cast(rest_size_minus_one); + + Eigen::Tensor batch_mean(depth); + Eigen::Tensor batch_variance(depth); + + batch_mean.device(d) = (x_rest_by_depth.sum(reduce_dims) * rest_size_inv); + auto x_centered = x_rest_by_depth - + batch_mean.reshape(one_by_depth).broadcast(bcast_spec); + + batch_variance.device(d) = + x_centered.square().sum(reduce_dims) * rest_size_inv; + auto scaling_factor = ((batch_variance + epsilon).rsqrt() * scale) + .eval() + .reshape(one_by_depth) + .broadcast(bcast_spec); + auto x_scaled = x_centered * scaling_factor; + auto x_shifted = + (x_scaled + offset.reshape(one_by_depth).broadcast(bcast_spec)) + .template cast(); + + y.reshape(rest_by_depth).device(d) = x_shifted; + if (exponential_avg_factor == U(1.0)) { + saved_batch_var.device(d) = batch_variance; + saved_batch_mean.device(d) = batch_mean; + new_variance.device(d) = batch_variance * rest_size_adjust; + new_mean.device(d) = batch_mean; + } else { + U one_minus_factor = U(1) - exponential_avg_factor; + saved_batch_var.device(d) = batch_variance; + saved_batch_mean.device(d) = batch_mean; + new_variance.device(d) = + one_minus_factor * old_variance + + (exponential_avg_factor * rest_size_adjust) * batch_variance; + new_mean.device(d) = + one_minus_factor * old_mean + exponential_avg_factor * batch_mean; + } + + if (tensor_format == FORMAT_NCHW) { + // Perform NHWC to NCHW + const std::vector perm = {0, 3, 1, 2}; + const Status s = ::tensorflow::DoTranspose( + context->eigen_device(), transformed_y, perm, y_output); + if (!s.ok()) { + context->SetStatus(errors::InvalidArgument("Transpose failed: ", s)); + } + } + } +}; + +template +struct FusedBatchNorm { void operator()(OpKernelContext* context, const Tensor& x_input, const Tensor& scale_input, const Tensor& offset_input, const Tensor& estimated_mean_input, const Tensor& estimated_variance_input, - const Tensor* side_input, U epsilon, + const Tensor* side_input, U epsilon, U exponential_avg_factor, FusedBatchNormActivationMode activation_mode, Tensor* y_output, Tensor* batch_mean_output, Tensor* batch_var_output, Tensor* saved_mean_output, Tensor* saved_var_output, TensorFormat tensor_format, - bool use_reserved_space, bool is_training) { + bool use_reserved_space) { OP_REQUIRES(context, side_input == nullptr, errors::Internal( "The CPU implementation of FusedBatchNorm does not support " @@ -150,9 +288,7 @@ struct FusedBatchNorm { estimated_variance_input.vec()); typename TTypes::Tensor y(transformed_y.tensor()); typename TTypes::Vec batch_mean(batch_mean_output->vec()); - typename TTypes::Vec batch_var(batch_var_output->vec()); - typename TTypes::Vec saved_mean(saved_mean_output->vec()); - typename TTypes::Vec saved_var(saved_var_output->vec()); + typename TTypes::Vec batch_variance(batch_var_output->vec()); const CPUDevice& d = context->eigen_device(); @@ -168,80 +304,36 @@ struct FusedBatchNorm { #else Eigen::IndexList, Eigen::Index> one_by_depth; one_by_depth.set(1, depth); - Eigen::IndexList> reduce_dims; Eigen::IndexList> bcast_spec; bcast_spec.set(0, rest_size); #endif auto x_rest_by_depth = x.reshape(rest_by_depth).template cast(); - const int rest_size_minus_one = (rest_size > 1) ? 
(rest_size - 1) : 1; - U rest_size_inv = static_cast(1.0f / static_cast(rest_size)); - // This adjustment is for Bessel's correction - U rest_size_adjust = - static_cast(rest_size) / static_cast(rest_size_minus_one); + auto x_centered = + x_rest_by_depth - + estimated_mean.reshape(one_by_depth).broadcast(bcast_spec); + auto scaling_factor = ((estimated_variance + epsilon).rsqrt() * scale) + .eval() + .reshape(one_by_depth) + .broadcast(bcast_spec); + auto x_scaled = x_centered * scaling_factor; + auto x_shifted = + (x_scaled + offset.reshape(one_by_depth).broadcast(bcast_spec)) + .template cast(); - Eigen::Tensor mean(depth); - Eigen::Tensor variance(depth); - BlockingCounter barrier(1); - std::atomic task_counter; - auto on_done = [&]() { - uint8 count = --task_counter; - if (count == 0) { - if (tensor_format == FORMAT_NCHW) { - // Perform NHWC to NCHW - const std::vector perm = {0, 3, 1, 2}; - const Status s = - ::tensorflow::DoTranspose(context->eigen_device(), - transformed_y, perm, y_output); - if (!s.ok()) { - context->SetStatus( - errors::InvalidArgument("Transpose failed: ", s)); - } - } - barrier.DecrementCount(); + y.reshape(rest_by_depth).device(d) = x_shifted; + batch_mean.device(d) = estimated_mean; + batch_variance.device(d) = estimated_variance; + + if (tensor_format == FORMAT_NCHW) { + // Perform NHWC to NCHW + const std::vector perm = {0, 3, 1, 2}; + const Status s = ::tensorflow::DoTranspose( + context->eigen_device(), transformed_y, perm, y_output); + if (!s.ok()) { + context->SetStatus(errors::InvalidArgument("Transpose failed: ", s)); } - }; - if (is_training) { - // TODO(b/137108598): Extend kernel to allow use of exponential averaging. - mean.device(d) = (x_rest_by_depth.sum(reduce_dims) * rest_size_inv); - auto x_centered = - x_rest_by_depth - mean.reshape(one_by_depth).broadcast(bcast_spec); - - variance.device(d) = x_centered.square().sum(reduce_dims) * rest_size_inv; - auto scaling_factor = ((variance + epsilon).rsqrt() * scale) - .eval() - .reshape(one_by_depth) - .broadcast(bcast_spec); - auto x_scaled = x_centered * scaling_factor; - auto x_shifted = - (x_scaled + offset.reshape(one_by_depth).broadcast(bcast_spec)) - .template cast(); - - task_counter = 5; - y.reshape(rest_by_depth).device(d, on_done) = x_shifted; - batch_var.device(d, on_done) = variance * rest_size_adjust; - saved_var.device(d, on_done) = variance; - batch_mean.device(d, on_done) = mean; - saved_mean.device(d, on_done) = mean; - } else { // is_training == false - auto x_centered = - x_rest_by_depth - - estimated_mean.reshape(one_by_depth).broadcast(bcast_spec); - auto scaling_factor = ((estimated_variance + epsilon).rsqrt() * scale) - .eval() - .reshape(one_by_depth) - .broadcast(bcast_spec); - auto x_scaled = x_centered * scaling_factor; - auto x_shifted = - (x_scaled + offset.reshape(one_by_depth).broadcast(bcast_spec)) - .template cast(); - - task_counter = 3; - y.reshape(rest_by_depth).device(d, on_done) = x_shifted; - mean.device(d, on_done) = estimated_mean; - variance.device(d, on_done) = estimated_variance; } - barrier.Wait(); } }; @@ -662,17 +754,17 @@ class CudnnBatchNormAllocatorInOutput : public ScratchAllocator { bool output_allocated = false; }; -template -struct FusedBatchNorm { +template +struct FusedBatchNorm { void operator()(OpKernelContext* context, const Tensor& x, const Tensor& scale, const Tensor& offset, const Tensor& estimated_mean, const Tensor& estimated_variance, const Tensor* side_input, - U epsilon, FusedBatchNormActivationMode activation_mode, - Tensor* y, 
Tensor* batch_mean, Tensor* batch_var, - Tensor* saved_mean, Tensor* saved_inv_var, - TensorFormat tensor_format, bool use_reserved_space, - bool is_training) { + U epsilon, U exponential_avg_factor, + FusedBatchNormActivationMode activation_mode, Tensor* y, + Tensor* batch_mean, Tensor* batch_var, Tensor* saved_mean, + Tensor* saved_inv_var, TensorFormat tensor_format, + bool use_reserved_space) { auto* stream = context->op_device_context()->stream(); OP_REQUIRES(context, stream, errors::Internal("No GPU stream available")); @@ -837,15 +929,13 @@ struct FusedBatchNorm { workspace_allocator.reset( new functor::CudnnBatchNormAllocatorInTemp(context)); } - // TODO(b/137108598): Extend kernel to allow use of exponential averaging. - const double exponential_average_factor = 1.0; bool cudnn_launch_status = stream ->ThenBatchNormalizationForward( x_ptr, scale_ptr, offset_ptr, estimated_mean_ptr, estimated_variance_ptr, side_input_ptr, x_desc, scale_offset_desc, static_cast(epsilon), - exponential_average_factor, + static_cast(exponential_avg_factor), AsDnnActivationMode(activation_mode), &y_ptr, &batch_mean_ptr, &batch_var_ptr, &saved_mean_ptr, &saved_inv_var_ptr, is_training, reserve_space_allocator.get(), @@ -1075,6 +1165,10 @@ class FusedBatchNormOpBase : public OpKernel { float epsilon; OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon)); epsilon_ = U(epsilon); + float exponential_avg_factor; + OP_REQUIRES_OK(context, context->GetAttr("exponential_avg_factor", + &exponential_avg_factor)); + exponential_avg_factor_ = U(exponential_avg_factor); string tensor_format; OP_REQUIRES_OK(context, context->GetAttr("data_format", &tensor_format)); OP_REQUIRES(context, FormatFromString(tensor_format, &tensor_format_), @@ -1165,17 +1259,6 @@ class FusedBatchNormOpBase : public OpKernel { "channel dimension to be a multiple of 4.")); } - if (is_training_) { - OP_REQUIRES( - context, estimated_mean.dim_size(0) == 0, - errors::InvalidArgument("estimated_mean must be empty for training", - estimated_mean.shape().DebugString())); - OP_REQUIRES(context, estimated_variance.dim_size(0) == 0, - errors::InvalidArgument( - "estimated_variance must be empty for training", - estimated_variance.shape().DebugString())); - } - Tensor* y = nullptr; OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( {0}, 0, x.shape(), &y)); @@ -1192,15 +1275,24 @@ class FusedBatchNormOpBase : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(4, scale.shape(), &saved_maybe_inv_var)); - functor::FusedBatchNorm()( - context, x, scale, offset, estimated_mean, estimated_variance, - side_input, epsilon_, activation_mode_, y, batch_mean, batch_var, - saved_mean, saved_maybe_inv_var, tensor_format_, use_reserved_space, - is_training_); + if (is_training_) { + functor::FusedBatchNorm()( + context, x, scale, offset, estimated_mean, estimated_variance, + side_input, epsilon_, exponential_avg_factor_, activation_mode_, y, + batch_mean, batch_var, saved_mean, saved_maybe_inv_var, + tensor_format_, use_reserved_space); + } else { + functor::FusedBatchNorm()( + context, x, scale, offset, estimated_mean, estimated_variance, + side_input, epsilon_, exponential_avg_factor_, activation_mode_, y, + batch_mean, batch_var, saved_mean, saved_maybe_inv_var, + tensor_format_, use_reserved_space); + } } private: U epsilon_; + U exponential_avg_factor_; TensorFormat tensor_format_; bool is_training_; bool has_side_input_; diff --git a/tensorflow/core/kernels/fused_batch_norm_op_test.cc 
b/tensorflow/core/kernels/fused_batch_norm_op_test.cc index 7da57143b77..734fb294135 100644 --- a/tensorflow/core/kernels/fused_batch_norm_op_test.cc +++ b/tensorflow/core/kernels/fused_batch_norm_op_test.cc @@ -40,6 +40,7 @@ TEST_F(FusedBatchNormOpTest, Training) { .Input(FakeInput(DT_FLOAT)) .Input(FakeInput(DT_FLOAT)) .Input(FakeInput(DT_FLOAT)) + .Attr("exponential_avg_factor", 1.0) .Attr("epsilon", 0.001) .Attr("is_training", true) .Finalize(node_def())); @@ -67,6 +68,41 @@ TEST_F(FusedBatchNormOpTest, Training) { test::ExpectTensorNear(expected_variance, *GetOutput(2), 0.01); } +TEST_F(FusedBatchNormOpTest, TrainingRunningMean) { + TF_EXPECT_OK(NodeDefBuilder("batch_norm_op", "FusedBatchNorm") + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Attr("exponential_avg_factor", 0.5) + .Attr("epsilon", 0.001) + .Attr("is_training", true) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + AddInputFromArray(TensorShape({1, 1, 6, 2}), + {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}); + AddInputFromArray(TensorShape({2}), {4.0, 4.0}); + AddInputFromArray(TensorShape({2}), {2.0, 2.0}); + AddInputFromArray(TensorShape({2}), {6.0, 6.0}); + AddInputFromArray(TensorShape({2}), {16.0, 16.0}); + + TF_ASSERT_OK(RunOpKernel()); + + Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2})); + test::FillValues(&expected, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83, + 3.17, 3.17, 5.51, 5.51, 7.86, 7.86}); + test::ExpectTensorNear(expected, *GetOutput(0), 0.01); + + Tensor expected_mean(allocator(), DT_FLOAT, TensorShape({2})); + test::FillValues(&expected_mean, {8, 8}); + test::ExpectTensorNear(expected_mean, *GetOutput(1), 0.01); + + Tensor expected_variance(allocator(), DT_FLOAT, TensorShape({2})); + test::FillValues(&expected_variance, {15.00, 15.00}); + test::ExpectTensorNear(expected_variance, *GetOutput(2), 0.01); +} + TEST_F(FusedBatchNormOpTest, Inference) { TF_EXPECT_OK(NodeDefBuilder("batch_norm_op", "FusedBatchNorm") .Input(FakeInput(DT_FLOAT)) @@ -93,6 +129,33 @@ TEST_F(FusedBatchNormOpTest, Inference) { test::ExpectTensorNear(expected, *GetOutput(0), 0.01); } +TEST_F(FusedBatchNormOpTest, InferenceIgnoreAvgFactor) { + TF_EXPECT_OK(NodeDefBuilder("batch_norm_op", "FusedBatchNorm") + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Attr("exponential_avg_factor", 0.5) + .Attr("epsilon", 0.001) + .Attr("is_training", false) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + AddInputFromArray(TensorShape({1, 1, 6, 2}), + {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}); + AddInputFromArray(TensorShape({2}), {4.0, 4.0}); + AddInputFromArray(TensorShape({2}), {2.0, 2.0}); + AddInputFromArray(TensorShape({2}), {10, 10}); + AddInputFromArray(TensorShape({2}), {11.67f, 11.67f}); + + TF_ASSERT_OK(RunOpKernel()); + + Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2})); + test::FillValues(&expected, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83, + 3.17, 3.17, 5.51, 5.51, 7.86, 7.86}); + test::ExpectTensorNear(expected, *GetOutput(0), 0.01); +} + class FusedBatchNormGradOpTest : public OpsTestBase {}; TEST_F(FusedBatchNormGradOpTest, Simple) { diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index 82adb489f94..84f25347a86 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -179,6 +179,7 @@ REGISTER_OP("FusedBatchNorm") 
.Output("reserve_space_2: T") .Attr("T: {float}") .Attr("epsilon: float = 0.0001") + .Attr("exponential_avg_factor: float = 1.0") .Attr(GetConvnetDataFormatAttrString()) .Attr("is_training: bool = true") .SetShapeFn(shape_inference::FusedBatchNormShape); @@ -197,6 +198,7 @@ REGISTER_OP("FusedBatchNormV2") .Attr("T: {half, bfloat16, float}") .Attr("U: {float}") .Attr("epsilon: float = 0.0001") + .Attr("exponential_avg_factor: float = 1.0") .Attr(GetConvnetDataFormatAttrString()) .Attr("is_training: bool = true") .SetShapeFn(shape_inference::FusedBatchNormShape); @@ -216,6 +218,7 @@ REGISTER_OP("FusedBatchNormV3") .Attr("T: {half, bfloat16, float}") .Attr("U: {float}") .Attr("epsilon: float = 0.0001") + .Attr("exponential_avg_factor: float = 1.0") .Attr(GetConvnetDataFormatAttrString()) .Attr("is_training: bool = true") .SetShapeFn(shape_inference::FusedBatchNormV3Shape); @@ -236,6 +239,7 @@ REGISTER_OP("_FusedBatchNormEx") .Attr("T: {half, float}") .Attr("U: {float}") .Attr("epsilon: float = 0.0001") + .Attr("exponential_avg_factor: float = 1.0") .Attr("num_side_inputs: int >= 0 = 0") .Attr("activation_mode: string = \"Identity\"") .Attr(GetConvnetDataFormatAttrString()) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index cf8bf14e42d..853f67c12de 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -1590,7 +1590,7 @@ tf_module { } member_method { name: "FusedBatchNorm" - argspec: "args=[\'x\', \'scale\', \'offset\', \'mean\', \'variance\', \'epsilon\', \'data_format\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'NHWC\', \'True\', \'None\'], " + argspec: "args=[\'x\', \'scale\', \'offset\', \'mean\', \'variance\', \'epsilon\', \'exponential_avg_factor\', \'data_format\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'1\', \'NHWC\', \'True\', \'None\'], " } member_method { name: "FusedBatchNormGrad" @@ -1606,11 +1606,11 @@ tf_module { } member_method { name: "FusedBatchNormV2" - argspec: "args=[\'x\', \'scale\', \'offset\', \'mean\', \'variance\', \'epsilon\', \'data_format\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'NHWC\', \'True\', \'None\'], " + argspec: "args=[\'x\', \'scale\', \'offset\', \'mean\', \'variance\', \'epsilon\', \'exponential_avg_factor\', \'data_format\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'1\', \'NHWC\', \'True\', \'None\'], " } member_method { name: "FusedBatchNormV3" - argspec: "args=[\'x\', \'scale\', \'offset\', \'mean\', \'variance\', \'epsilon\', \'data_format\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'NHWC\', \'True\', \'None\'], " + argspec: "args=[\'x\', \'scale\', \'offset\', \'mean\', \'variance\', \'epsilon\', \'exponential_avg_factor\', \'data_format\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'1\', \'NHWC\', \'True\', \'None\'], " } member_method { name: "FusedPadConv2D" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index cf8bf14e42d..853f67c12de 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -1590,7 +1590,7 @@ tf_module { } member_method { name: "FusedBatchNorm" - argspec: "args=[\'x\', \'scale\', 
\'offset\', \'mean\', \'variance\', \'epsilon\', \'data_format\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'NHWC\', \'True\', \'None\'], " + argspec: "args=[\'x\', \'scale\', \'offset\', \'mean\', \'variance\', \'epsilon\', \'exponential_avg_factor\', \'data_format\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'1\', \'NHWC\', \'True\', \'None\'], " } member_method { name: "FusedBatchNormGrad" @@ -1606,11 +1606,11 @@ tf_module { } member_method { name: "FusedBatchNormV2" - argspec: "args=[\'x\', \'scale\', \'offset\', \'mean\', \'variance\', \'epsilon\', \'data_format\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'NHWC\', \'True\', \'None\'], " + argspec: "args=[\'x\', \'scale\', \'offset\', \'mean\', \'variance\', \'epsilon\', \'exponential_avg_factor\', \'data_format\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'1\', \'NHWC\', \'True\', \'None\'], " } member_method { name: "FusedBatchNormV3" - argspec: "args=[\'x\', \'scale\', \'offset\', \'mean\', \'variance\', \'epsilon\', \'data_format\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'NHWC\', \'True\', \'None\'], " + argspec: "args=[\'x\', \'scale\', \'offset\', \'mean\', \'variance\', \'epsilon\', \'exponential_avg_factor\', \'data_format\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'1\', \'NHWC\', \'True\', \'None\'], " } member_method { name: "FusedPadConv2D" From e04c53beb8ff31f5700766939e358acebc1d5d02 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Tue, 18 Feb 2020 13:54:08 -0800 Subject: [PATCH 170/442] Methods for generation of Winograd transformation matrices. Method to convert weights with Winograd transformation matrices. 
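
The matrices follow the construction from https://openreview.net/pdf?id=H1ZaRZVKg that the new GetTransposedMatrixForWinograd / GetInversedMatrixForWinograd helpers below implement: a Vandermonde-style matrix over the interpolation points 0, +/-sqrt(2)/2, +/-sqrt(2) and the point at infinity, with the input transform obtained by inversion. A minimal NumPy sketch of the same construction, for reference only (the Python function name and the F(4x4, 3x3) formula in the comments are illustrative, not part of the patch):

```python
import numpy as np

def transposed_winograd_matrix(width, height):
    # Interpolation points: 0, +/-sqrt(2)/2, +/-sqrt(2), ..., plus the point at
    # infinity encoded in homogeneous coordinates as (px, py) = (1, 0).
    d = np.sqrt(2.0) / 2.0
    px = np.zeros(width)
    for i in range((width - 1) // 2):
        px[2 * i + 1] = d * (i + 1)
        px[2 * i + 2] = -d * (i + 1)
    px[width - 1] = 1.0
    py = np.ones(width)
    py[width - 1] = 0.0
    # Entry (x, y) is px[y]^x * py[y]^(height-1-x), matching the C++ helper.
    return np.array([[px[y] ** x * py[y] ** (height - 1 - x)
                      for y in range(width)]
                     for x in range(height)])

At = transposed_winograd_matrix(6, 4)                   # output transform, 4x6
Bt = np.linalg.inv(transposed_winograd_matrix(6, 6))    # input transform, 6x6
G = transposed_winograd_matrix(6, 3).T                  # filter transform, 6x3

# F(4x4, 3x3): for a 6x6 input tile d and a 3x3 filter g, the 4x4 output tile
# is At @ ((G @ g @ G.T) * (Bt @ d @ Bt.T)) @ At.T, with "*" elementwise.
```
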
PiperOrigin-RevId: 295814385 Change-Id: I26fc1bfe9f4ae01aacccc2dc59ccb1bf94975aea --- .../lite/delegates/gpu/cl/kernels/BUILD | 2 + .../lite/delegates/gpu/cl/kernels/util.cc | 125 ++++++++++++++++++ .../lite/delegates/gpu/cl/kernels/util.h | 13 ++ .../lite/delegates/gpu/cl/kernels/winograd.cc | 91 +++++++------ .../lite/delegates/gpu/cl/kernels/winograd.h | 3 +- .../delegates/gpu/cl/kernels/winograd_test.cc | 93 +++++++++---- 6 files changed, 257 insertions(+), 70 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD index 7be6a56d587..6b9bf5ce6e8 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD @@ -1331,6 +1331,7 @@ cc_library( "//tensorflow/lite/delegates/gpu/cl:tensor", "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", + "@com_google_absl//absl/strings", ], ) @@ -1344,6 +1345,7 @@ cc_test( ], deps = [ ":cl_test", + ":util", ":winograd", "//tensorflow/lite/delegates/gpu/cl:tensor_type", "//tensorflow/lite/delegates/gpu/common:operations", diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/util.cc b/tensorflow/lite/delegates/gpu/cl/kernels/util.cc index b0784b4c6d5..9b46c91b921 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/util.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/util.cc @@ -520,6 +520,131 @@ float4 GetMaskForLastPlane(int channels) { return mask; } +namespace { +// Matrices for Winograd trasformations received with method described here +// https://openreview.net/pdf?id=H1ZaRZVKg +std::vector GetTransposedMatrixForWinograd(int width, int height) { + const float kDelta = std::sqrt(2.0f) / 2.0f; + std::vector px(width); + + px[0] = 0.0f; + const int points_count = (width - 1) / 2; + for (int i = 0; i < points_count; ++i) { + px[i * 2 + 1] = kDelta * (i + 1.0f); + px[i * 2 + 2] = -kDelta * (i + 1.0f); + } + px[width - 1] = 1.0f; + + std::vector py(width, 1.0f); + py[width - 1] = 0.0f; + + std::vector result(height * width); + for (int y = 0; y < width; ++y) { + for (int x = 0; x < height; ++x) { + result[x * width + y] = + std::pow(px[y], 1.0f * x) * std::pow(py[y], (height - 1.0f) - x); + } + } + return result; +} + +std::vector GetInversedMatrixForWinograd(int rank) { + auto matrix = GetTransposedMatrixForWinograd(rank, rank); + std::vector inverted(rank * rank, 0.0f); + for (int i = 0; i < rank; ++i) { + inverted[i * rank + i] = 1.0f; + } + + for (int i = 1; i < rank - 1; ++i) { + float inv_t = 1.0f / matrix[i * rank + i]; + for (int x = i; x < rank; ++x) { + matrix[i * rank + x] *= inv_t; + } + for (int x = 0; x < rank; ++x) { + inverted[i * rank + x] *= inv_t; + } + + for (int y = 0; y < rank; ++y) { + if (y == i) continue; + float t = matrix[y * rank + i]; + for (int x = i; x < rank; ++x) { + matrix[y * rank + x] -= t * matrix[i * rank + x]; + } + for (int x = 0; x < rank; ++x) { + inverted[y * rank + x] -= t * inverted[i * rank + x]; + } + } + } + + return inverted; +} + +std::vector Multiply(const std::vector& a_mat, + const std::vector& b_mat, int m, int n, + int k) { + std::vector result(m * k); + for (int y = 0; y < m; ++y) { + for (int x = 0; x < k; ++x) { + float sum = 0.0f; + for (int i = 0; i < n; ++i) { + sum += a_mat[y * n + i] * b_mat[i * k + x]; + } + result[y * k + x] = sum; + } + } + return result; +} +} // namespace + +std::vector AtMatrixForWinograd4x4To6x6() { + return GetTransposedMatrixForWinograd(6, 4); +} + +std::vector BtMatrixForWinograd4x4To6x6() { + 
return GetInversedMatrixForWinograd(6); +} + +void RearrangeWeightsToWinograd4x4To6x6Weights( + const ::tflite::gpu::Tensor& src_weights, + ::tflite::gpu::Tensor* dst_weights) { + OHWI dst_shape; + dst_shape.o = src_weights.shape.o; + dst_shape.h = 6; + dst_shape.w = 6; + dst_shape.i = src_weights.shape.i; + dst_weights->shape = dst_shape; + dst_weights->data.resize(dst_shape.DimensionsProduct()); + + auto gt_mat = GetTransposedMatrixForWinograd(6, 3); + std::vector g_mat(gt_mat.size()); + for (int y = 0; y < 3; ++y) { + for (int x = 0; x < 6; ++x) { + g_mat[x * 3 + y] = gt_mat[y * 6 + x]; + } + } + + for (int d = 0; d < src_weights.shape.o; ++d) { + for (int s = 0; s < src_weights.shape.i; ++s) { + std::vector in_vals(9); + for (int y = 0; y < 3; ++y) { + for (int x = 0; x < 3; ++x) { + const int f_index = src_weights.shape.LinearIndex({d, y, x, s}); + in_vals[y * 3 + x] = src_weights.data[f_index]; + } + } + + auto temp_vals = Multiply(g_mat, in_vals, 6, 3, 3); + auto out_vals = Multiply(temp_vals, gt_mat, 6, 3, 6); + for (int y = 0; y < 6; ++y) { + for (int x = 0; x < 6; ++x) { + const int f_index = dst_shape.LinearIndex({d, y, x, s}); + dst_weights->data[f_index] = out_vals[y * 6 + x]; + } + } + } + } +} + } // namespace cl } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/util.h b/tensorflow/lite/delegates/gpu/cl/kernels/util.h index 0d0c7b793c3..14ad9ec0bc3 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/util.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/util.h @@ -244,6 +244,19 @@ void RearrangeWeightsToOHWIOGroupI4O4( } } +// Matrices for Winograd trasformations received with method described here +// https://openreview.net/pdf?id=H1ZaRZVKg + +// returns A transposed matrix(6 * 4) as array (24 values) for Winograd4x4To6x6 +std::vector AtMatrixForWinograd4x4To6x6(); + +// returns B transposed matrix(6 * 6) as array (36 values) for Winograd4x4To6x6 +std::vector BtMatrixForWinograd4x4To6x6(); + +void RearrangeWeightsToWinograd4x4To6x6Weights( + const ::tflite::gpu::Tensor& src_weights, + ::tflite::gpu::Tensor* dst_weights); + // Returns fastest TextureAddressMode that return ZERO for out-of-range image // coordinates. // diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc index a6402779ff7..cfc172055ab 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include #include +#include "absl/strings/str_format.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h" #include "tensorflow/lite/delegates/gpu/common/shape.h" @@ -48,16 +49,17 @@ std::string GetWinograd4x4To36Code( src_tensor_type == TensorStorageType::IMAGE_BUFFER; const bool is_buffer = src_tensor_type == TensorStorageType::BUFFER; - c += R"( - constant FLT Bt[36] = { - 1.0000000000f, 0.0000000887f, -2.3075673580f, 0.0000000089f, 0.8519787788f, -0.0000000000f, - -0.0000000000f, 0.9057970643f, 1.2307025194f, -0.4180375934f, -0.5679858327f, 0.0000000000f, - 0.0000000000f, -0.9057970643f, 1.2307025194f, 0.4180375934f, -0.5679858327f, -0.0000000000f, - -0.0000000000f, -0.1132246330f, -0.0769189075f, 0.2090187818f, 0.1419964582f, 0.0000000000f, - 0.0000000000f, 0.1132246330f, -0.0769189224f, -0.2090187967f, 0.1419964582f, -0.0000000000f, - -0.0000000000f, 1.1737382412f, -0.0000000532f, -2.7084801197f, -0.0000000355f, 1.0000000000f, -}; -)"; + auto bt_mat = BtMatrixForWinograd4x4To6x6(); + c += "constant FLT Bt[36] = {\n"; + for (int y = 0; y < 6; ++y) { + c += "\t"; + for (int x = 0; x < 6; ++x) { + c += absl::StrFormat("%.10f", bt_mat[y * 6 + x]) + "f, "; + } + c += "\n"; + } + c += "};\n"; + c += "__kernel void main_function(\n"; c += src_tensor.GetDeclaration(AccessType::READ) + ",\n"; c += bt_arr.GetDeclaration(); @@ -211,14 +213,17 @@ std::string GetWinograd36To4x4Code( const std::string batch_id = op_def.IsBatchSupported() ? "batch_id" : ""; std::string c = GetCommonDefines(op_def.precision); - c += R"( -constant FLT At[24] = { - 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 0.0000000000f, - 0.0000000000f, 0.7360000014f, -0.7360000014f, 1.4720000029f, -1.4720000029f, 0.0000000000f, - 0.0000000000f, 0.5416960120f, 0.5416960120f, 2.1667840481f, 2.1667840481f, 0.0000000000f, - 0.0000000000f, 0.3986882567f, -0.3986882567f, 3.1895060539f, -3.1895060539f, 1.0000000000f, -}; -)"; + auto at_mat = AtMatrixForWinograd4x4To6x6(); + c += "constant FLT At[24] = {\n"; + for (int y = 0; y < 4; ++y) { + c += "\t"; + for (int x = 0; x < 6; ++x) { + c += absl::StrFormat("%.10f", at_mat[y * 6 + x]) + "f, "; + } + c += "\n"; + } + c += "};\n"; + c += "__kernel void main_function(\n"; c += src_tensor.GetDeclaration(AccessType::READ) + ",\n"; c += at_arr.GetDeclaration() + ",\n"; @@ -341,26 +346,23 @@ Status Winograd4x4To36::Compile(const CreationContext& creation_context) { } Status Winograd4x4To36::UploadBt(CLContext* context) { - ::tflite::gpu::Tensor Bt; - Bt.shape = Linear(48); - Bt.data = {1.0000000000f, 0.0000000887f, -2.3075673580f, 0.0000000089f, - 0.8519787788f, 0.0000000000f, 0.0000000000f, 0.0000000000f, - 0.0000000000f, 0.9057970643f, 1.2307025194f, -0.4180375934f, - -0.5679858327f, 0.0000000000f, 0.0000000000f, 0.0000000000f, - 0.0000000000f, -0.9057970643f, 1.2307025194f, 0.4180375934f, - -0.5679858327f, -0.0000000000f, 0.0000000000f, 0.0000000000f, - 0.0000000000f, -0.1132246330f, -0.0769189075f, 0.2090187818f, - 0.1419964582f, 0.0000000000f, 0.0000000000f, 0.0000000000f, - 0.0000000000f, 0.1132246330f, -0.0769189224f, -0.2090187967f, - 0.1419964582f, 0.0000000000f, 0.0000000000f, 0.0000000000f, - 0.0000000000f, 1.1737382412f, -0.0000000532f, -2.7084801197f, - -0.0000000355f, 1.0000000000f, 0.0000000000f, 0.0000000000f}; + ::tflite::gpu::Tensor bt_aligned; + bt_aligned.shape = Linear(6 * 8); + bt_aligned.data.resize(6 * 8); + auto bt_mat = BtMatrixForWinograd4x4To6x6(); + for (int 
y = 0; y < 6; ++y) { + for (int x = 0; x < 6; ++x) { + bt_aligned.data[y * 8 + x] = bt_mat[y * 6 + x]; + } + bt_aligned.data[y * 8 + 6] = 0.0f; + bt_aligned.data[y * 8 + 7] = 0.0f; + } LinearStorageCreateInfo create_info; create_info.storage_type = LinearStorageType::TEXTURE_2D; create_info.data_type = definition_.GetDataType(); create_info.name = "bt_arr"; - return CreateLinearStorage(create_info, Bt, context, &bt_); + return CreateLinearStorage(create_info, bt_aligned, context, &bt_); } Status Winograd4x4To36::BindArguments() { @@ -436,22 +438,23 @@ Status Winograd36To4x4::Compile(const CreationContext& creation_context) { } Status Winograd36To4x4::UploadAt(CLContext* context) { - ::tflite::gpu::Tensor At; - At.shape = Linear(32); - At.data = {1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, - 1.0000000000f, 0.0000000000f, 0.0000000000f, 0.0000000000f, - 0.0000000000f, 0.7360000014f, -0.7360000014f, 1.4720000029f, - -1.4720000029f, 0.0000000000f, 0.0000000000f, 0.0000000000f, - 0.0000000000f, 0.5416960120f, 0.5416960120f, 2.1667840481f, - 2.1667840481f, 0.0000000000f, 0.0000000000f, 0.0000000000f, - 0.0000000000f, 0.3986882567f, -0.3986882567f, 3.1895060539f, - -3.1895060539f, 1.0000000000f, 0.0000000000f, 0.0000000000f}; + ::tflite::gpu::Tensor at_aligned; + at_aligned.shape = Linear(4 * 8); + at_aligned.data.resize(4 * 8); + auto at_mat = AtMatrixForWinograd4x4To6x6(); + for (int y = 0; y < 4; ++y) { + for (int x = 0; x < 6; ++x) { + at_aligned.data[y * 8 + x] = at_mat[y * 6 + x]; + } + at_aligned.data[y * 8 + 6] = 0.0f; + at_aligned.data[y * 8 + 7] = 0.0f; + } LinearStorageCreateInfo create_info; create_info.storage_type = LinearStorageType::TEXTURE_2D; create_info.data_type = definition_.GetDataType(); create_info.name = "at_arr"; - return CreateLinearStorage(create_info, At, context, &at_); + return CreateLinearStorage(create_info, at_aligned, context, &at_); } Status Winograd36To4x4::BindArguments() { diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.h b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.h index 630d2c92faa..baa758ac6d8 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.h @@ -30,8 +30,7 @@ namespace cl { // You can read https://arxiv.org/pdf/1509.09308.pdf for understanding of basic // principles. In this kernels used different matrices for transformations than -// in original work. Matrices received with method described here -// https://openreview.net/pdf?id=H1ZaRZVKg +// in original work. class Winograd4x4To36 : public GPUOperation { public: Winograd4x4To36() = default; diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/winograd_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/winograd_test.cc index ba0f9d6c74a..3f0a6ceff74 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/winograd_test.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/winograd_test.cc @@ -22,6 +22,8 @@ limitations under the License. 
#include #include #include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h" +#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" +#include "tensorflow/lite/delegates/gpu/cl/precision.h" #include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" #include "tensorflow/lite/delegates/gpu/common/operations.h" #include "tensorflow/lite/delegates/gpu/common/status.h" @@ -37,8 +39,41 @@ namespace { TEST_F(OpenCLOperationTest, Winograd4x4To36) { TensorFloat32 src_tensor; src_tensor.shape = BHWC(1, 4, 4, 1); - src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, - 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f}; + src_tensor.data.resize(16); + for (int i = 0; i < 16; ++i) { + src_tensor.data[i] = sin(i); + } + + TensorFloat32 dst_ref; + dst_ref.shape = BHWC(1, 36, 1, 1); + dst_ref.data.resize(36, 0.0f); + auto b_t = BtMatrixForWinograd4x4To6x6(); + + // Bt * Src * B + // 1: temp = Src * B + std::vector temp(36, 0.0f); + for (int y = 0; y < 6; ++y) { + for (int x = 0; x < 6; ++x) { + float sum = 0.0f; + for (int i = 0; i < 6; ++i) { + if (y < 1 || y > 4 || i < 1 || i > 4) continue; + const int index = src_tensor.shape.LinearIndex({0, y - 1, i - 1, 0}); + sum += src_tensor.data[index] * b_t[x * 6 + i]; + } + temp[y * 6 + x] = sum; + } + } + // 2: ref = Bt * temp + for (int y = 0; y < 6; ++y) { + for (int x = 0; x < 6; ++x) { + float sum = 0.0f; + for (int i = 0; i < 6; ++i) { + sum += b_t[y * 6 + i] * temp[i * 6 + x]; + } + const int index = dst_ref.shape.LinearIndex({0, y * 6 + x, 0, 0}); + dst_ref.data[index] = sum; + } + } for (auto storage : env_.GetSupportedStorages()) { for (auto precision : env_.GetSupportedPrecisions()) { @@ -57,20 +92,7 @@ TEST_F(OpenCLOperationTest, Winograd4x4To36) { CreateWinograd4x4To36(creation_context_, op_def, padding, &wino_up)); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &wino_up, BHWC(1, 36, 1, 1), &dst_tensor)); - EXPECT_THAT(dst_tensor.data, - Pointwise(FloatNear(eps), - {-1.8076144457f, 3.0488157272f, -0.3543013334f, - -0.9567713737f, 0.0698715150f, 6.3601350784f, - 7.9091277122f, -7.5317668915f, -0.4988912344f, - 0.0400028825f, 0.0815277994f, 1.8058515787f, - -2.0690131187f, 1.4405870438f, 0.3173895180f, - 0.3676810265f, -0.0566446260f, -3.1750767231f, - -4.4264192581f, 3.3195235729f, 0.5952118039f, - 0.6170299053f, -0.1053467616f, -5.5806870461f, - 0.3939223289f, -0.2771621346f, -0.0594099388f, - -0.0679424182f, 0.0105922129f, 0.5897778869f, - 31.1582794189f, -22.9188480377f, -4.3477787971f, - -4.6630558968f, 0.7714096308f, 41.5681838989f})); + EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), dst_ref.data)); } } } @@ -90,6 +112,36 @@ TEST_F(OpenCLOperationTest, Winograd36To4x4) { biases.data[i] = 0.0f; } + TensorFloat32 dst_ref; + dst_ref.shape = BHWC(1, 4, 4, 1); + dst_ref.data.resize(16, 0.0f); + auto a_t = AtMatrixForWinograd4x4To6x6(); + + // At * Src * A + // 1: temp = Src * A + std::vector temp(24, 0.0f); + for (int y = 0; y < 6; ++y) { + for (int x = 0; x < 4; ++x) { + float sum = 0.0f; + for (int i = 0; i < 6; ++i) { + const int index = src_tensor.shape.LinearIndex({0, y * 6 + i, 0, 0}); + sum += src_tensor.data[index] * a_t[x * 6 + i]; + } + temp[y * 4 + x] = sum; + } + } + // 2: ref = At * temp + for (int y = 0; y < 4; ++y) { + for (int x = 0; x < 4; ++x) { + float sum = 0.0f; + for (int i = 0; i < 6; ++i) { + sum += a_t[y * 6 + i] * temp[i * 4 + x]; + } + const int index = dst_ref.shape.LinearIndex({0, y, x, 0}); + dst_ref.data[index] = sum; + } + } + for (auto storage : 
env_.GetSupportedStorages()) { for (auto precision : env_.GetSupportedPrecisions()) { const float eps = precision == CalculationsPrecision::F32 ? 1e-5f : 1e-2f; @@ -104,14 +156,7 @@ TEST_F(OpenCLOperationTest, Winograd36To4x4) { CreateWinograd36To4x4(creation_context_, op_def, biases, &wino_down)); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &wino_down, BHWC(1, 4, 4, 1), &dst_tensor)); - EXPECT_THAT( - dst_tensor.data, - Pointwise( - FloatNear(eps), - {5.6982488632f, 4.4291338921f, 7.1398024559f, 8.3108062744f, - 0.2751901150f, 0.6380079389f, -1.6235249043f, 0.6435587406f, - 5.8707995415f, 3.3895490170f, 12.8032960892f, 7.8921923637f, - 1.2864947319f, 1.1310911179f, 1.0033880472f, 1.9512135983f})); + EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), dst_ref.data)); } } } From cd206d6e069a73594c5b14dc6539f8258453f257 Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Tue, 18 Feb 2020 13:55:16 -0800 Subject: [PATCH 171/442] Automated rollback of commit 992b5eb9facc724592661bfdf22e6f5765a0c63c PiperOrigin-RevId: 295814687 Change-Id: Icf77a2db160d23f5f109f1214b342fb0ee8b8bba --- tensorflow/lite/kernels/internal/optimized/optimized_ops.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h index 721da4eca3f..abb712ddf60 100644 --- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h @@ -1313,19 +1313,12 @@ inline void HybridConv(const ConvParams& params, float* scaling_factors_ptr, std::fill_n(output_data, output_rows * output_cols, 0.0f); -#ifdef TFLITE_WITH_RUY_GEMV // The scratch buffer must have the same size as the output. TFLITE_DCHECK_EQ(accum_scratch_shape.FlatSize(), output_shape.FlatSize()); tensor_utils::MatrixBatchVectorMultiplyAccumulate( filter_data, filter_rows, filter_cols, gemm_input_data, scaling_factors_ptr, /*n_batch=*/gemm_input_rows, accum_scratch, output_data, /*result_stride=*/1, context); -#else - tensor_utils::MatrixBatchVectorMultiplyAccumulate( - filter_data, filter_rows, filter_cols, gemm_input_data, - scaling_factors_ptr, /*n_batch=*/gemm_input_rows, output_data, - /*result_stride=*/1); -#endif AddBiasAndEvalActivationFunction(output_activation_min, output_activation_max, bias_shape, bias_data, output_shape, output_data); From d7087d362bce491e5d0c1d80668370465887f3c3 Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Tue, 18 Feb 2020 13:59:04 -0800 Subject: [PATCH 172/442] Add one pattern to remove the quantize->dequantize pairs for the floating-point constants The existence of this pattern indicates the user op doesn't have sufficient quantization parameters to be quantized. Then we should keep the floating-point constants as float. 
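
Concretely: when the dequantize survives because its consumer still runs in float, materializing the constant as int8 and dequantizing it at runtime buys nothing and can only lose precision. A small NumPy sketch of that round-trip, using made-up example quantization parameters (the scale/zero-point values are illustrative, not taken from the pass):

```python
import numpy as np

scale, zero_point = 0.1, 0          # assumed example qparams for an i8 type
w = np.array([[-0.07, 1.03], [0.52, 2.99]], dtype=np.float32)

q = np.clip(np.round(w / scale) + zero_point, -128, 127).astype(np.int8)
dq = (q.astype(np.float32) - zero_point) * scale   # what a float consumer sees

# dq is roughly [[-0.1, 1.0], [0.5, 3.0]]: the unfused quantize->dequantize
# pair only injects rounding error for a consumer that was never quantized,
# so the new pattern folds dequantize(quantize(const)) back to the float const.
print(w - dq)
```
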
PiperOrigin-RevId: 295815623 Change-Id: I0adcaeecd8b71c381c5e9b6f7266eb07b31513e9 --- .../lite/quantization/quantization_utils.h | 3 +- .../compiler/mlir/lite/tests/quantize.mlir | 37 +++++++++++-------- .../mlir/lite/transforms/quantize_patterns.td | 8 ++++ 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h index 749ee7a9f57..ed998510328 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h @@ -150,7 +150,8 @@ struct QuantizationPattern : public RewritePattern { explicit QuantizationPattern(MLIRContext* context, bool enable_verify, float error_tolerance, bool single_layer_verify) - : RewritePattern(DQ::getOperationName(), 1, context), + // Set the score to a large number so it is always preferred. + : RewritePattern(DQ::getOperationName(), 300, context), enable_verify(enable_verify), error_tolerance(error_tolerance), single_layer_verify(single_layer_verify) {} diff --git a/tensorflow/compiler/mlir/lite/tests/quantize.mlir b/tensorflow/compiler/mlir/lite/tests/quantize.mlir index 89d1e7cb7f4..0261644e6de 100644 --- a/tensorflow/compiler/mlir/lite/tests/quantize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/quantize.mlir @@ -2,39 +2,44 @@ // RUN: tf-opt %s -tfl-prepare-quantize -tfl-quantize -tfl-numeric-verify | FileCheck --check-prefix=DEBUG %s // CHECK-LABEL: QuantizeFloatConst -func @QuantizeFloatConst() -> tensor { +func @QuantizeFloatConst() -> tensor<2x2x!quant.uniform> { %0 = constant dense<-0.1> : tensor<2x2xf32> - %1 = "tfl.quantize"(%0) {qtype = tensor>} : (tensor<2x2xf32>) -> tensor> - %2 = "tfl.dequantize"(%1) : (tensor>) -> tensor - return %2 : tensor + %1 = "tfl.quantize"(%0) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> + return %1 : tensor<2x2x!quant.uniform> -// CHECK: %[[cst:.*]] = "tfl.pseudo_qconst"() {qtype = tensor>, value = dense<0> : tensor<2x2xi8>} -// CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[cst]]) -// CHECK: return %[[dq]] : tensor +// CHECK: %[[cst:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<2x2x!quant.uniform>, value = dense<0> : tensor<2x2xi8>} +// CHECK: return %[[cst]] } // CHECK-LABEL: QuantizeDenseFloatConst -func @QuantizeDenseFloatConst() -> tensor<2x2xf32> { +func @QuantizeDenseFloatConst() -> tensor<2x2x!quant.uniform> { %0 = constant dense<[[-0.1, 1.0], [1.0, 3.0]]> : tensor<2x2xf32> %1 = "tfl.quantize"(%0) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> - %2 = "tfl.dequantize"(%1) : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf32> - return %2 : tensor<2x2xf32> + return %1 : tensor<2x2x!quant.uniform> // CHECK: %[[cst:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<2x2x!quant.uniform>, value = dense<{{\[\[}}0, -1], {{\[}}-1, -1]]> : tensor<2x2xi8>} -// CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[cst]]) -// CHECK: return %[[dq]] : tensor<2x2xf32> +// CHECK: return %[[cst]] } // CHECK-LABEL: QuantizeSplatFloatConst -func @QuantizeSplatFloatConst() -> tensor<2x2xf32> { +func @QuantizeSplatFloatConst() -> tensor<2x2x!quant.uniform> { %0 = constant dense<3.0> : tensor<2x2xf32> %1 = "tfl.quantize"(%0) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> + return %1 : tensor<2x2x!quant.uniform> + +// CHECK: %[[cst:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<2x2x!quant.uniform>, value = dense<-1> : tensor<2x2xi8>} +// 
CHECK: return %[[cst]] +} + +// CHECK-LABEL: NotQuantizeFloatConst +func @NotQuantizeFloatConst() -> tensor<2x2xf32> { + %0 = constant dense<-0.1> : tensor<2x2xf32> + %1 = "tfl.quantize"(%0) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> %2 = "tfl.dequantize"(%1) : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf32> return %2 : tensor<2x2xf32> -// CHECK: %[[cst:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<2x2x!quant.uniform>, value = dense<-1> : tensor<2x2xi8>} -// CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[cst]]) -// CHECK: return %[[dq]] : tensor<2x2xf32> +// CHECK: %[[cst:.*]] = constant dense<-1.000000e-01> : tensor<2x2xf32> +// CHECK: return %[[cst]] : tensor<2x2xf32> } // CHECK-LABEL: DequantizeAndQuantize diff --git a/tensorflow/compiler/mlir/lite/transforms/quantize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/quantize_patterns.td index 5f61ae3efc3..07dd8ab4455 100644 --- a/tensorflow/compiler/mlir/lite/transforms/quantize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/quantize_patterns.td @@ -21,12 +21,20 @@ include "tensorflow/compiler/mlir/lite/ir/tfl_ops.td" // Quantize attribute $0 by using quantization parameter from %1. def QuantizeByQuantizedType : NativeCodeCall<"quant::Quantize($0, $1.getValue())">; +def F32ElementsAttr : ElementsAttrBase< + CPred<"$_self.cast().getType().getElementType().isF32()">, "float constant tensor">; // Squash tfl.dequantize and tfl.quantize pairs. // TODO(fengliuai): Compare the scale of input and output. This can also be // squashed to a requantize op if the scales are different. def : Pat<(TFL_QuantizeOp (TFL_DequantizeOp $in), $qt), (replaceWithValue $in)>; +// If the tfl.dequantize op wasn't fused, we shouldn't quantize the floating +// point constant. +def : Pat<(TFL_DequantizeOp + (TFL_QuantizeOp (ConstantOp F32ElementsAttr:$cst), $qt)), + (ConstantOp $cst)>; + // Quantize the value of a constant op if the quantization parameters have been // propagated to the output. def : Pat<(TFL_QuantizeOp From ffb61470b82fce2283fc34d46bc189d30090a138 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Tue, 18 Feb 2020 14:12:04 -0800 Subject: [PATCH 173/442] Use JoinPath in resource_loader's implementation. This enables correct path handling on Windows. PiperOrigin-RevId: 295819038 Change-Id: I0b755393d97c69e08e0ed89f2204087572ab8427 --- tensorflow/core/platform/default/BUILD | 1 + tensorflow/core/platform/default/resource_loader.cc | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/platform/default/BUILD b/tensorflow/core/platform/default/BUILD index 2f056bf75f4..07a057718cb 100644 --- a/tensorflow/core/platform/default/BUILD +++ b/tensorflow/core/platform/default/BUILD @@ -326,6 +326,7 @@ cc_library( ], deps = [ "//tensorflow/core/platform:logging", + "//tensorflow/core/platform:path", "@bazel_tools//tools/cpp/runfiles", ], ) diff --git a/tensorflow/core/platform/default/resource_loader.cc b/tensorflow/core/platform/default/resource_loader.cc index 423ac4a3d8d..09c0e7cabee 100644 --- a/tensorflow/core/platform/default/resource_loader.cc +++ b/tensorflow/core/platform/default/resource_loader.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/core/platform/resource_loader.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/path.h" #include "tools/cpp/runfiles/runfiles.h" using bazel::tools::cpp::runfiles::Runfiles; @@ -30,8 +31,7 @@ std::string GetDataDependencyFilepath(const std::string& relative_path) { LOG(FATAL) << "Unable to access the data dependencies of this test.\n" "Make sure you are running this test using bazel."; } - string root_dir = "org_tensorflow/"; - return runfiles->Rlocation(root_dir + relative_path); + return runfiles->Rlocation(io::JoinPath("org_tensorflow", relative_path)); } } // namespace tensorflow From 8264abb627cbe687bf8816755f2297f0dc06287f Mon Sep 17 00:00:00 2001 From: Jeremy Lau Date: Tue, 18 Feb 2020 14:14:08 -0800 Subject: [PATCH 174/442] Reduce aggregate_ops' dependencies by moving OP_REQUIRES macro definitions to a new op_requires.h PiperOrigin-RevId: 295819544 Change-Id: If9ac368d7bcc0eadca38a73aff2f86dd0220c87a --- tensorflow/core/framework/BUILD | 9 +++ tensorflow/core/framework/op_kernel.h | 52 +-------------- tensorflow/core/framework/op_requires.h | 81 ++++++++++++++++++++++++ tensorflow/core/kernels/BUILD | 12 ++++ tensorflow/core/kernels/aggregate_ops.cc | 1 + tensorflow/core/kernels/aggregate_ops.h | 2 +- 6 files changed, 105 insertions(+), 52 deletions(-) create mode 100644 tensorflow/core/framework/op_requires.h diff --git a/tensorflow/core/framework/BUILD b/tensorflow/core/framework/BUILD index f3207dd657a..8ba4b03d803 100644 --- a/tensorflow/core/framework/BUILD +++ b/tensorflow/core/framework/BUILD @@ -51,6 +51,7 @@ exports_files( "node_def_builder.h", "numeric_op.h", "op_kernel.h", + "op_requires.h", "op_segment.h", "ops_util.h", "partial_tensor_shape.h", @@ -180,6 +181,7 @@ filegroup( "op_def_builder.h", "op_def_util.h", "op_kernel.h", + "op_requires.h", "op_segment.h", "ops_util.h", "partial_tensor_shape.h", @@ -351,6 +353,7 @@ filegroup( "op_def_util.h", "op_kernel.cc", "op_kernel.h", + "op_requires.h", "op_segment.cc", "op_segment.h", "ops_util.cc", @@ -931,6 +934,12 @@ cc_library( ], ) +cc_library( + name = "op_requires", + hdrs = ["op_requires.h"], + deps = ["//tensorflow/core/platform:macros"], +) + # Files whose users still need to be migrated from core:framework to the # above targets. # TODO(gonnet): Remove these files once targets depending on them have diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h index e0d9742768a..9e22321b42c 100644 --- a/tensorflow/core/framework/op_kernel.h +++ b/tensorflow/core/framework/op_kernel.h @@ -33,6 +33,7 @@ limitations under the License. #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/node_properties.h" #include "tensorflow/core/framework/op.h" // TODO(b/62899350): Remove +#include "tensorflow/core/framework/op_requires.h" #include "tensorflow/core/framework/rendezvous.h" #include "tensorflow/core/framework/selective_registration.h" #include "tensorflow/core/framework/session_state.h" @@ -1776,19 +1777,6 @@ inline void OpOutputList::set_ref(int i, mutex* mu, Tensor* tensor_for_ref) { ctx_->set_output_ref(i, mu, tensor_for_ref); } -// Convenience macros for asserting and handling exceptional conditions. -// Analogous to the CHECK* macros provided by logging.h. -// -// Example use: -// void Compute(OperationContext* context) { -// OP_REQUIRES(context, context->num_inputs() == 2, -// errors::InvalidArgument("FooOp requires 2 arguments")); -// ... 
-// Status status = SomeUncertainMethod(); -// OP_REQUIRES_OK(context, status); -// ... -// } - // Generate a fatal error if OP_REQUIRES or OP_REQUIRES_OK are used in // AsyncOpKernel implementations. If these macros are used and the condition // does not hold, the `done` callback will never be called and the system will @@ -1802,44 +1790,6 @@ inline void CheckNotInComputeAsync(OpKernelConstruction*, const char*) {} void CheckNotInComputeAsync(OpKernelContext* ctx, const char* correct_macro_name); -#define OP_REQUIRES(CTX, EXP, STATUS) \ - do { \ - if (!TF_PREDICT_TRUE(EXP)) { \ - CheckNotInComputeAsync((CTX), "OP_REQUIRES_ASYNC"); \ - (CTX)->CtxFailure(__FILE__, __LINE__, (STATUS)); \ - return; \ - } \ - } while (0) - -#define OP_REQUIRES_OK(CTX, ...) \ - do { \ - ::tensorflow::Status _s(__VA_ARGS__); \ - if (!TF_PREDICT_TRUE(_s.ok())) { \ - CheckNotInComputeAsync((CTX), "OP_REQUIRES_OK_ASYNC"); \ - (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s); \ - return; \ - } \ - } while (0) - -#define OP_REQUIRES_ASYNC(CTX, EXP, STATUS, CALLBACK) \ - do { \ - if (!TF_PREDICT_TRUE(EXP)) { \ - (CTX)->CtxFailure(__FILE__, __LINE__, (STATUS)); \ - (CALLBACK)(); \ - return; \ - } \ - } while (0) - -#define OP_REQUIRES_OK_ASYNC(CTX, STATUS, CALLBACK) \ - do { \ - ::tensorflow::Status _s(STATUS); \ - if (!TF_PREDICT_TRUE(_s.ok())) { \ - (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s); \ - (CALLBACK)(); \ - return; \ - } \ - } while (0) - } // namespace tensorflow #endif // TENSORFLOW_CORE_FRAMEWORK_OP_KERNEL_H_ diff --git a/tensorflow/core/framework/op_requires.h b/tensorflow/core/framework/op_requires.h new file mode 100644 index 00000000000..ea80bfd7b2d --- /dev/null +++ b/tensorflow/core/framework/op_requires.h @@ -0,0 +1,81 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_OP_REQUIRES_H_ +#define TENSORFLOW_CORE_FRAMEWORK_OP_REQUIRES_H_ + +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +// Convenience macros for asserting and handling exceptional conditions. +// Analogous to the CHECK* macros provided by logging.h. +// +// Example use: +// void Compute(OperationContext* context) { +// OP_REQUIRES(context, context->num_inputs() == 2, +// errors::InvalidArgument("FooOp requires 2 arguments")); +// ... +// Status status = SomeUncertainMethod(); +// OP_REQUIRES_OK(context, status); +// ... +// } +// +// These macros depend on CheckNotInComputeAsync, which must be defined before +// invoking the macro. We specifically don't include op_kernel.h from this +// header to reduce this header's dependencies. These macros may be used with +// alternative implementations of OpKernelContext with fewer dependencies. 
+ +#define OP_REQUIRES(CTX, EXP, STATUS) \ + do { \ + if (!TF_PREDICT_TRUE(EXP)) { \ + CheckNotInComputeAsync((CTX), "OP_REQUIRES_ASYNC"); \ + (CTX)->CtxFailure(__FILE__, __LINE__, (STATUS)); \ + return; \ + } \ + } while (0) + +#define OP_REQUIRES_OK(CTX, ...) \ + do { \ + ::tensorflow::Status _s(__VA_ARGS__); \ + if (!TF_PREDICT_TRUE(_s.ok())) { \ + CheckNotInComputeAsync((CTX), "OP_REQUIRES_OK_ASYNC"); \ + (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s); \ + return; \ + } \ + } while (0) + +#define OP_REQUIRES_ASYNC(CTX, EXP, STATUS, CALLBACK) \ + do { \ + if (!TF_PREDICT_TRUE(EXP)) { \ + (CTX)->CtxFailure(__FILE__, __LINE__, (STATUS)); \ + (CALLBACK)(); \ + return; \ + } \ + } while (0) + +#define OP_REQUIRES_OK_ASYNC(CTX, STATUS, CALLBACK) \ + do { \ + ::tensorflow::Status _s(STATUS); \ + if (!TF_PREDICT_TRUE(_s.ok())) { \ + (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s); \ + (CALLBACK)(); \ + return; \ + } \ + } while (0) + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_OP_REQUIRES_H_ diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 409f52db948..e0004af3f17 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -3932,6 +3932,18 @@ tf_kernel_library( deps = MATH_DEPS, ) +cc_library( + name = "aggregate_ops_headers", + hdrs = [ + "aggregate_ops.h", + "aggregate_ops_cpu.h", + ], + deps = [ + "//tensorflow/core:framework", + "//third_party/eigen3", + ], +) + tf_kernel_library( name = "argmax_op", prefix = "argmax_op", diff --git a/tensorflow/core/kernels/aggregate_ops.cc b/tensorflow/core/kernels/aggregate_ops.cc index 43337d68f84..511a5f77a66 100644 --- a/tensorflow/core/kernels/aggregate_ops.cc +++ b/tensorflow/core/kernels/aggregate_ops.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/core/kernels/aggregate_ops.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/kernels/aggregate_ops_cpu.h" diff --git a/tensorflow/core/kernels/aggregate_ops.h b/tensorflow/core/kernels/aggregate_ops.h index 30cccb22a19..5023d0dc8e7 100644 --- a/tensorflow/core/kernels/aggregate_ops.h +++ b/tensorflow/core/kernels/aggregate_ops.h @@ -18,7 +18,7 @@ limitations under the License. #include -#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/op_requires.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/variant.h" #include "tensorflow/core/framework/variant_op_registry.h" From e47e4bfb9ea32d779c606755edae31827572a724 Mon Sep 17 00:00:00 2001 From: Srinivas Vasudevan Date: Tue, 18 Feb 2020 14:17:29 -0800 Subject: [PATCH 175/442] Improve numerics for Sinh, Asinh and Atanh in XLA. - Rewrite Sinh,Asinh for smaller parameter regions so they return non-zero values for small x. - Use Log1p in Atanh to retrieve non-zero values for small x. 
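For intuition, a minimal NumPy float32 sketch (illustrative only, not part of this patch; NumPy is assumed here) of why the naive identities collapse to zero for small x while the rewritten forms in math.cc below keep the leading-order term ~x:

```python
# Illustrative sketch only -- mirrors the math.cc rewrites in float32 NumPy.
import numpy as np

x = np.float32(1e-9)
one, half = np.float32(1.0), np.float32(0.5)

# sinh: (e^x - e^-x) / 2 cancels to 0; the expm1-based form keeps ~x.
naive_sinh = half * (np.exp(x) - np.exp(-x))
e = np.expm1(x)
stable_sinh = half * (e + e / (e + one))

# asinh: log(x + sqrt(x^2 + 1)) evaluates log(1) = 0; the log1p form keeps ~x.
naive_asinh = np.log(x + np.sqrt(x * x + one))
stable_asinh = np.log1p(x + x * x / (one + np.sqrt(x * x + one)))

# atanh: 0.5 * log((1 + x) / (1 - x)) evaluates to 0; the log1p form keeps ~x.
naive_atanh = half * np.log((one + x) / (one - x))
stable_atanh = half * (np.log1p(x) - np.log1p(-x))

print(naive_sinh, stable_sinh)    # 0.0 vs ~1e-09
print(naive_asinh, stable_asinh)  # 0.0 vs ~1e-09
print(naive_atanh, stable_atanh)  # 0.0 vs ~1e-09
```

Each stable form agrees with x to first order, which is what the new small-value tests added to math_test.cc below check.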
PiperOrigin-RevId: 295820343 Change-Id: Ia330e201c2fac8497f3b021290550715cf067a81 --- tensorflow/compiler/xla/client/lib/math.cc | 49 ++++++++++++++++--- .../compiler/xla/client/lib/math_test.cc | 24 +++++++++ 2 files changed, 66 insertions(+), 7 deletions(-) diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc index d0971734570..710ac478176 100644 --- a/tensorflow/compiler/xla/client/lib/math.cc +++ b/tensorflow/compiler/xla/client/lib/math.cc @@ -1008,12 +1008,23 @@ XlaOp Asinh(XlaOp x) { if (primitive_util::IsComplexType(shape.element_type())) { return Log(x + Sqrt(x * x + one)); } + // For small x, sqrt(x**2 + 1) will evaluate to 1 due to floating point + // arithmetic. However, we would like to retain the low order term of this, + // which is around 0.5 * x**2 using a binomial expansion. + // Let z = sqrt(a**2 + 1) + // log(a + sqrt(a**2 + 1)) = + // log((a + sqrt(a**2 + 1)) * (1 + sqrt(a**2 + 1)) / (1 + sqrt(a**2 + 1))) = + // log((a + a**2 + 1 + a * z + z) / (1 + z)) = + // log(1 + a + a**2 / (1 + z)) = + // log(1 + a + a ** 2 / (1 + sqrt(a**2 + 1))) + // This rewrite retains the lower order term. auto a = Abs(x); + auto small_result = Log1p(a + a * a / (one + Sqrt(a * a + one))); auto naive_result = Log(a + Sqrt(a * a + one)); auto overflow_result = Log(Abs(a)) + Log(ScalarLike(a, 2)); auto sqrt_max_value = Sqrt(MaxFiniteValue(b, shape.element_type())); - return Sign(x) * - Select(Ge(a, sqrt_max_value), overflow_result, naive_result); + return Sign(x) * Select(Ge(a, sqrt_max_value), overflow_result, + Select(Le(a, one), small_result, naive_result)); }; // These upcasts are not strictly necessary on all platforms to get within our // error tolerances, so we could relax this if it ever mattered. @@ -1028,9 +1039,7 @@ XlaOp Atanh(XlaOp x) { XlaBuilder* b = x.builder(); auto do_it = [&](XlaOp x) -> StatusOr { TF_ASSIGN_OR_RETURN(auto shape, b->GetShape(x)); - auto naive_result = - Log((ScalarLike(x, 1.0) + x) / (ScalarLike(x, 1.0) - x)) * - ScalarLike(x, 0.5); + auto naive_result = (Log1p(x) - Log1p(-x)) * ScalarLike(x, 0.5); // TODO(jlebar): For now, we ignore the nan edge case for complex inputs, // because we don't yet have exhaustive tests for complex trig functions. @@ -1074,9 +1083,35 @@ XlaOp Cosh(XlaOp x) { // correct answer of 3.40281961e+38 (0x7f7fffec) is very close to max-float, so // we deem this acceptable. XlaOp Sinh(XlaOp x) { - return DoWithUpcastToF32(x, {BF16, F16}, [](XlaOp x) { + XlaBuilder* b = x.builder(); + auto do_it = [&](XlaOp x) -> StatusOr { + TF_ASSIGN_OR_RETURN(auto shape, b->GetShape(x)); + auto one_half = ScalarLike(x, 0.5); auto log_one_half = Log(ScalarLike(x, 0.5)); - return Exp(x + log_one_half) - Exp(-x + log_one_half); + auto large_sinh_result = Exp(x + log_one_half) - Exp(-x + log_one_half); + + if (primitive_util::IsComplexType(shape.element_type())) { + return large_sinh_result; + } + + // Here we use e^x = e^(x / 2) * e^(x / 2). This avoids overflow for large + // values of x. + + // For smaller x, we get unwanted cancellations of e^x - e^-x, resulting in + // 0. + // Rewrite this to avoid that. We use expm1(x) because that preserves the + // first order term of the taylor series of e^x. + // (e^(x) - e^(-x)) / 2. = + // (e^(x) - 1 + 1 - e^(-x)) / 2. + // (expm1(x) + (e^(x) - 1) / e^x) / 2. + // (expm1(x) + expm1(x) / (expm1(x) + 1)) / 2. 
+ auto expm1 = Expm1(x); + auto one = ScalarLike(x, 1.); + auto small_sinh_result = one_half * (expm1 + expm1 / (expm1 + one)); + return Select(Lt(Abs(x), one), small_sinh_result, large_sinh_result); + }; + return DoWithUpcastToF32(x, {BF16, F16}, [&](XlaOp x) { + return b->ReportErrorOrReturn(do_it(x)); }); } diff --git a/tensorflow/compiler/xla/client/lib/math_test.cc b/tensorflow/compiler/xla/client/lib/math_test.cc index faf30f68a10..32796dd8d70 100644 --- a/tensorflow/compiler/xla/client/lib/math_test.cc +++ b/tensorflow/compiler/xla/client/lib/math_test.cc @@ -298,6 +298,30 @@ XLA_TEST_F(MathTest, SqrtSixValues) { ComputeAndCompareR1(&builder, expected, {}, error_spec_); } +XLA_TEST_F(MathTest, SinhSmallValues) { + XlaBuilder builder(TestName()); + auto x = ConstantR1(&builder, {1e-3, 1e-5, 1e-7, 1e-9, 1e-11}); + Sinh(x); + std::vector expected = {1e-3, 1e-5, 1e-7, 1e-9, 1e-11}; + ComputeAndCompareR1(&builder, expected, {}, error_spec_); +} + +XLA_TEST_F(MathTest, AsinhSmallValues) { + XlaBuilder builder(TestName()); + auto x = ConstantR1(&builder, {1e-3, 1e-5, 1e-7, 1e-9, 1e-11}); + Asinh(x); + std::vector expected = {1e-3, 1e-5, 1e-7, 1e-9, 1e-11}; + ComputeAndCompareR1(&builder, expected, {}, error_spec_); +} + +XLA_TEST_F(MathTest, AtanhSmallValues) { + XlaBuilder builder(TestName()); + auto x = ConstantR1(&builder, {1e-8, 1e-9, 1e-10, 1e-11}); + Atanh(x); + std::vector expected = {1e-8, 1e-9, 1e-10, 1e-11}; + ComputeAndCompareR1(&builder, expected, {}, error_spec_); +} + XLA_TEST_F(MathTest, Lgamma) { XlaBuilder builder(TestName()); auto x = ConstantR1(&builder, {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.5, 1.5, From 79fdda5ead75a16e7b44d5574e2708586adfcaf9 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Tue, 18 Feb 2020 14:22:17 -0800 Subject: [PATCH 176/442] Use GetDataDependencyFilepath and JoinPath to find data files. This enables proper windows path support. PiperOrigin-RevId: 295821454 Change-Id: I9a9163d0d3b5a12d2bc944cbb338e99cd6a86142 --- tensorflow/compiler/xla/tests/BUILD | 5 +++-- tensorflow/compiler/xla/tests/sample_file_test.cc | 10 ++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index bf2a1d64476..68c5538b1db 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -2429,15 +2429,16 @@ tf_cc_test( tags = tf_cuda_tests_tags(), deps = [ ":hlo_test_base", + ":literal_test_util", + ":xla_internal_test_main", # fixdeps: keep "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla/service:cpu_plugin", # reference backend "//tensorflow/compiler/xla/service:gpu_plugin", # test backend "//tensorflow/compiler/xla/service:platform_util", - "//tensorflow/compiler/xla/tests:literal_test_util", - "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep "//tensorflow/core:lib", "//tensorflow/core:test", + "//tensorflow/core/platform:resource_loader", ], ) diff --git a/tensorflow/compiler/xla/tests/sample_file_test.cc b/tensorflow/compiler/xla/tests/sample_file_test.cc index 31b104f4e37..d793dfc7960 100644 --- a/tensorflow/compiler/xla/tests/sample_file_test.cc +++ b/tensorflow/compiler/xla/tests/sample_file_test.cc @@ -25,6 +25,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/resource_loader.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -41,10 +43,10 @@ class SampleFileTest : public HloTestBase { }; TEST_F(SampleFileTest, Convolution) { - const string& filename = "compiler/xla/tests/isolated_convolution.hlo"; - string test_srcdir = tensorflow::testing::TensorFlowSrcRoot(); - EXPECT_TRUE(RunAndCompareFromFile( - tensorflow::io::JoinPath(test_srcdir, filename), ErrorSpec{0.01})); + const string& filename = tensorflow::GetDataDependencyFilepath( + tensorflow::io::JoinPath("tensorflow", "compiler", "xla", "tests", + "isolated_convolution.hlo")); + EXPECT_TRUE(RunAndCompareFromFile(filename, ErrorSpec{0.01})); } } // namespace From b391cb55c2861f1cf57311f85b4a893604fea3af Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 18 Feb 2020 14:23:36 -0800 Subject: [PATCH 177/442] Internal change PiperOrigin-RevId: 295821819 Change-Id: I7307e94062e3020ec26896634cfc23041773ff8e --- tensorflow/python/keras/layers/convolutional.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/python/keras/layers/convolutional.py b/tensorflow/python/keras/layers/convolutional.py index b4cb8fe5f42..519915808e4 100644 --- a/tensorflow/python/keras/layers/convolutional.py +++ b/tensorflow/python/keras/layers/convolutional.py @@ -124,6 +124,8 @@ class Conv(Layer): activity_regularizer=regularizers.get(activity_regularizer), **kwargs) self.rank = rank + if filters is not None and not isinstance(filters, int): + filters = int(filters) self.filters = filters self.kernel_size = conv_utils.normalize_tuple( kernel_size, rank, 'kernel_size') From f396035891b0938364ea247a7dd243a147930c6e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 18 Feb 2020 14:36:47 -0800 Subject: [PATCH 178/442] Upgrade and rename external dependency grpc in workspace for bazel. Fixes #33758 Downstream projects depending on TensorFlow: If bazel complains, please substitute `@zlib_archive` with `@zlib`, and `@grpc` with `@com_github_grpc_grpc` in WORKPLACE. 
PiperOrigin-RevId: 295824868 Change-Id: If2259d59e9d82543369e5670916b1398374c9889 --- WORKSPACE | 25 ++++++++++++++++++ tensorflow/BUILD | 8 +++--- tensorflow/core/BUILD | 4 +-- tensorflow/core/debug/BUILD | 2 +- tensorflow/core/lib/io/BUILD | 6 ++--- tensorflow/core/lib/png/BUILD | 2 +- tensorflow/core/platform/BUILD | 2 +- .../core/platform/default/build_config/BUILD | 4 +-- tensorflow/tensorflow.bzl | 2 +- tensorflow/tools/ci_build/ci_sanity.sh | 24 +++++++++++++++-- tensorflow/tools/lib_package/BUILD | 14 +++++----- tensorflow/tools/pip_package/BUILD | 8 +++--- tensorflow/workspace.bzl | 26 +++++++++---------- third_party/curl.BUILD | 2 +- third_party/llvm/llvm.autogenerated.BUILD | 2 +- third_party/png.BUILD | 2 +- third_party/protobuf/protobuf.patch | 2 +- third_party/systemlibs/syslibs_configure.bzl | 4 +-- 18 files changed, 91 insertions(+), 48 deletions(-) diff --git a/WORKSPACE b/WORKSPACE index 0139c4aa643..ad645add449 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -113,3 +113,28 @@ http_archive( "https://storage.googleapis.com/download.tensorflow.org/models/speech_commands_v0.01.zip", ], ) + +# Required for dependency @com_github_grpc_grpc + +load("@com_github_grpc_grpc//bazel:grpc_deps.bzl", "grpc_deps") + +grpc_deps() + +load( + "@build_bazel_rules_apple//apple:repositories.bzl", + "apple_rules_dependencies", +) + +apple_rules_dependencies() + +load( + "@build_bazel_apple_support//lib:repositories.bzl", + "apple_support_dependencies", +) + +apple_support_dependencies() + +load("@upb//bazel:repository_defs.bzl", "bazel_version_repository") + +bazel_version_repository(name = "bazel_version") + diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 31efafb7801..55406a5686a 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -547,8 +547,8 @@ cc_library( name = "grpc", visibility = ["//visibility:public"], deps = select({ - ":linux_s390x": ["@grpc//:grpc_unsecure"], - "//conditions:default": ["@grpc"], + ":linux_s390x": ["@com_github_grpc_grpc//:grpc_unsecure"], + "//conditions:default": ["@com_github_grpc_grpc//:grpc"], }), ) @@ -556,8 +556,8 @@ cc_library( name = "grpc++", visibility = ["//visibility:public"], deps = select({ - ":linux_s390x": ["@grpc//:grpc++_unsecure"], - "//conditions:default": ["@grpc//:grpc++"], + ":linux_s390x": ["@com_github_grpc_grpc//:grpc++_unsecure"], + "//conditions:default": ["@com_github_grpc_grpc//:grpc++"], }), ) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 4f0df417037..5002f80c059 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -1997,7 +1997,7 @@ cc_library( "//tensorflow/core/util:env_var", "//tensorflow/core/util:reporter", # TODO(gunan): REMOVE as soon as cc_shared_library is supported. "@snappy", - "@zlib_archive//:zlib", + "@zlib", "@double_conversion//:double-conversion", "@com_google_protobuf//:protobuf", ] + tf_protos_all_impl() + tf_protos_grappler_impl() + tf_protos_profiler_impl(), @@ -3077,7 +3077,7 @@ tf_cc_tests( "@com_google_absl//absl/strings", "@com_google_absl//absl/synchronization", "@com_google_absl//absl/types:optional", - "@zlib_archive//:zlib", + "@zlib", ], ) diff --git a/tensorflow/core/debug/BUILD b/tensorflow/core/debug/BUILD index 4cf8bc3588e..d9dfbc16677 100644 --- a/tensorflow/core/debug/BUILD +++ b/tensorflow/core/debug/BUILD @@ -38,7 +38,7 @@ package( # Check that tensorflow/core:tensorflow does not depend on grpc. 
check_deps( name = "core_tensorflow_check_deps", - disallowed_deps = ["@grpc//:grpc++"], + disallowed_deps = ["@com_github_grpc_grpc//:grpc++"], deps = ["//tensorflow/core:tensorflow"], ) diff --git a/tensorflow/core/lib/io/BUILD b/tensorflow/core/lib/io/BUILD index 68dff3009fa..87b5090a59f 100644 --- a/tensorflow/core/lib/io/BUILD +++ b/tensorflow/core/lib/io/BUILD @@ -240,7 +240,7 @@ cc_library( hdrs = ["zlib_compression_options.h"], deps = [ "//tensorflow/core/platform:types", - "@zlib_archive//:zlib", + "@zlib", ], alwayslink = True, ) @@ -258,7 +258,7 @@ cc_library( "//tensorflow/core/platform:macros", "//tensorflow/core/platform:strcat", "//tensorflow/core/platform:types", - "@zlib_archive//:zlib", + "@zlib", ], alwayslink = True, ) @@ -275,7 +275,7 @@ cc_library( "//tensorflow/core/platform:env", "//tensorflow/core/platform:macros", "//tensorflow/core/platform:types", - "@zlib_archive//:zlib", + "@zlib", ], alwayslink = True, ) diff --git a/tensorflow/core/lib/png/BUILD b/tensorflow/core/lib/png/BUILD index db2ab4801ee..7abc82e6a0f 100644 --- a/tensorflow/core/lib/png/BUILD +++ b/tensorflow/core/lib/png/BUILD @@ -22,7 +22,7 @@ cc_library( "//tensorflow/core/platform:stringpiece", "//tensorflow/core/platform:types", "@com_google_absl//absl/base", - "@zlib_archive//:zlib", + "@zlib", ], ) diff --git a/tensorflow/core/platform/BUILD b/tensorflow/core/platform/BUILD index b992f1abdfb..1b03357f48e 100644 --- a/tensorflow/core/platform/BUILD +++ b/tensorflow/core/platform/BUILD @@ -899,7 +899,7 @@ tf_cc_tests( "@com_google_absl//absl/strings", "@com_google_absl//absl/synchronization", "@com_google_absl//absl/types:optional", - "@zlib_archive//:zlib", + "@zlib", ], ) diff --git a/tensorflow/core/platform/default/build_config/BUILD b/tensorflow/core/platform/default/build_config/BUILD index 7545bc5b2c0..20f0e9e42d9 100644 --- a/tensorflow/core/platform/default/build_config/BUILD +++ b/tensorflow/core/platform/default/build_config/BUILD @@ -153,7 +153,7 @@ cc_library( "@farmhash_archive//:farmhash", "@fft2d", "@highwayhash//:sip_hash", - "@zlib_archive//:zlib", + "@zlib", ], ) @@ -178,7 +178,7 @@ cc_library( copts = tf_copts(), deps = [ "@png", - "@zlib_archive//:zlib", + "@zlib", ], ) diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 2beac63feb4..f86010cef2a 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -1758,7 +1758,7 @@ def transitive_hdrs(name, deps = [], **kwargs): # # For: # * Eigen: it's a header-only library. Add it directly to your deps. -# * GRPC: add a direct dep on @grpc//:grpc++_public_hdrs. +# * GRPC: add a direct dep on @com_github_grpc_grpc//:grpc++_public_hdrs. 
# def cc_header_only_library(name, deps = [], includes = [], extra_deps = [], **kwargs): _transitive_hdrs(name = name + "_gather", deps = deps) diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh index 7189a636a29..9397bbd4f60 100755 --- a/tensorflow/tools/ci_build/ci_sanity.sh +++ b/tensorflow/tools/ci_build/ci_sanity.sh @@ -357,12 +357,32 @@ do_external_licenses_check(){ # Blacklist echo ${MISSING_LICENSES_FILE} - grep -e "@bazel_tools//third_party/" -e "@bazel_tools//tools" -e "@local" -e "@com_google_absl//absl" -e "@org_tensorflow//" -e "@com_github_googlecloudplatform_google_cloud_cpp//google" -v ${MISSING_LICENSES_FILE} > temp.txt + grep \ + -e "@bazel_tools//third_party/" \ + -e "@bazel_tools//tools" \ + -e "@local" \ + -e "@com_google_absl//absl" \ + -e "@org_tensorflow//" \ + -e "@com_github_googlecloudplatform_google_cloud_cpp//google" \ + -e "@com_github_grpc_grpc//src/compiler" \ + -e "@platforms//os" \ + -v ${MISSING_LICENSES_FILE} > temp.txt mv temp.txt ${MISSING_LICENSES_FILE} # Whitelist echo ${EXTRA_LICENSE_FILE} - grep -e "//third_party/mkl_dnn" -e "@bazel_tools//src" -e "@bazel_tools//tools/" -e "@org_tensorflow//tensorflow" -e "@com_google_absl//" -e "//external" -e "@local" -e "@com_github_googlecloudplatform_google_cloud_cpp//" -e "@embedded_jdk//" -e "^//$" -v ${EXTRA_LICENSES_FILE} > temp.txt + grep \ + -e "//third_party/mkl_dnn" \ + -e "@bazel_tools//src" \ + -e "@bazel_tools//tools/" \ + -e "@org_tensorflow//tensorflow" \ + -e "@com_google_absl//" \ + -e "//external" \ + -e "@local" \ + -e "@com_github_googlecloudplatform_google_cloud_cpp//" \ + -e "@embedded_jdk//" \ + -e "^//$" \ + -v ${EXTRA_LICENSES_FILE} > temp.txt mv temp.txt ${EXTRA_LICENSES_FILE} diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD index fb88a61b424..d68d8c333b5 100644 --- a/tensorflow/tools/lib_package/BUILD +++ b/tensorflow/tools/lib_package/BUILD @@ -164,7 +164,7 @@ genrule( "@six_archive//:LICENSE", "@snappy//:COPYING", "@sobol_data//:LICENSE", - "@zlib_archive//:zlib.h", + "@zlib//:zlib.h", ] + select({ "//tensorflow:android": [], "//tensorflow:ios": [], @@ -200,10 +200,10 @@ genrule( "//third_party/mkl:LICENSE", "//third_party/mkl_dnn:LICENSE", ]) + if_not_system_lib( - "grpc", + "com_github_grpc_grpc", [ - "@grpc//:LICENSE", - "@grpc//third_party/address_sorting:LICENSE", + "@com_github_grpc_grpc//:LICENSE", + "@com_github_grpc_grpc//third_party/address_sorting:LICENSE", ], ) + tf_additional_license_deps(), outs = ["THIRD_PARTY_TF_C_LICENSES"], @@ -228,8 +228,8 @@ genrule( "@fft2d//:fft2d/readme2d.txt", "@gemmlowp//:LICENSE", "@gif//:COPYING", - "@grpc//:LICENSE", - "@grpc//third_party/address_sorting:LICENSE", + "@com_github_grpc_grpc//:LICENSE", + "@com_github_grpc_grpc//third_party/address_sorting:LICENSE", "@highwayhash//:LICENSE", "@icu//:icu4j/main/shared/licenses/LICENSE", "@libjpeg_turbo//:LICENSE.md", @@ -244,7 +244,7 @@ genrule( "@six_archive//:LICENSE", "@snappy//:COPYING", "@sobol_data//:LICENSE", - "@zlib_archive//:zlib.h", + "@zlib//:zlib.h", ] + select({ "//tensorflow:android": [], "//tensorflow:ios": [], diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index f6e17a6e46c..c50dea89482 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -190,7 +190,7 @@ filegroup( "@sobol_data//:LICENSE", "@swig//:LICENSE", "@termcolor_archive//:COPYING.txt", - "@zlib_archive//:zlib.h", + "@zlib//:zlib.h", ] + select({ 
"//tensorflow:android": [], "//tensorflow:ios": [], @@ -235,10 +235,10 @@ filegroup( "@absl_py//absl/third_party/unittest3_backport:LICENSE", ], ) + if_not_system_lib( - "grpc", + "com_github_grpc_grpc", [ - "@grpc//:LICENSE", - "@grpc//third_party/address_sorting:LICENSE", + "@com_github_grpc_grpc//:LICENSE", + "@com_github_grpc_grpc//third_party/address_sorting:LICENSE", ], ) + if_ngraph([ "@ngraph//:LICENSE", diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index dfe6a9e4499..6d74a7fed92 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -476,8 +476,6 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): PROTOBUF_SHA256 = "b9e92f9af8819bbbc514e2902aec860415b70209f31dfc8c4fa72515a5df9d59" PROTOBUF_STRIP_PREFIX = "protobuf-310ba5ee72661c081129eb878c1bbcec936b20f0" - # protobuf depends on @zlib, it has to be renamed to @zlib_archive because "zlib" is already - # defined using bind for grpc. PROTOBUF_PATCH = "//third_party/protobuf:protobuf.patch" tf_http_archive( @@ -562,20 +560,20 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): # WARNING: make sure ncteisen@ and vpai@ are cc-ed on any CL to change the below rule tf_http_archive( - name = "grpc", - sha256 = "67a6c26db56f345f7cee846e681db2c23f919eba46dd639b09462d1b6203d28c", - strip_prefix = "grpc-4566c2a29ebec0835643b972eb99f4306c4234a3", + name = "com_github_grpc_grpc", + sha256 = "b956598d8cbe168b5ee717b5dafa56563eb5201a947856a6688bbeac9cac4e1f", + strip_prefix = "grpc-b54a5b338637f92bfcf4b0bc05e0f57a5fd8fadd", system_build_file = clean_dep("//third_party/systemlibs:grpc.BUILD"), urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/grpc/grpc/archive/4566c2a29ebec0835643b972eb99f4306c4234a3.tar.gz", - "https://github.com/grpc/grpc/archive/4566c2a29ebec0835643b972eb99f4306c4234a3.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/grpc/grpc/archive/b54a5b338637f92bfcf4b0bc05e0f57a5fd8fadd.tar.gz", + "https://github.com/grpc/grpc/archive/b54a5b338637f92bfcf4b0bc05e0f57a5fd8fadd.tar.gz", ], ) tf_http_archive( name = "com_github_nanopb_nanopb", sha256 = "8bbbb1e78d4ddb0a1919276924ab10d11b631df48b657d960e0c795a25515735", - build_file = "@grpc//third_party:nanopb.BUILD", + build_file = "@com_github_grpc_grpc//third_party:nanopb.BUILD", strip_prefix = "nanopb-f8ac463766281625ad710900479130c7fcb4d63b", urls = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/nanopb/nanopb/archive/f8ac463766281625ad710900479130c7fcb4d63b.tar.gz", @@ -649,7 +647,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) tf_http_archive( - name = "zlib_archive", + name = "zlib", build_file = clean_dep("//third_party:zlib.BUILD"), sha256 = "c3e5e9fdd5004dcb542feda5ee4f0ff0744628baf8ed2dd5d66f8ca1197cb1a1", strip_prefix = "zlib-1.2.11", @@ -1071,21 +1069,21 @@ def tf_bind(): # Needed by Protobuf native.bind( name = "grpc_cpp_plugin", - actual = "@grpc//:grpc_cpp_plugin", + actual = "@com_github_grpc_grpc//src/compiler:grpc_cpp_plugin", ) native.bind( name = "grpc_python_plugin", - actual = "@grpc//:grpc_python_plugin", + actual = "@com_github_grpc_grpc//src/compiler:grpc_python_plugin", ) native.bind( name = "grpc_lib", - actual = "@grpc//:grpc++", + actual = "@com_github_grpc_grpc//:grpc++", ) native.bind( name = "grpc_lib_unsecure", - actual = "@grpc//:grpc++_unsecure", + actual = "@com_github_grpc_grpc//:grpc++_unsecure", ) # Needed by gRPC @@ -1134,5 +1132,5 @@ def tf_bind(): # Needed by gRPC native.bind( name = "zlib", - actual = 
"@zlib_archive//:zlib", + actual = "@zlib", ) diff --git a/third_party/curl.BUILD b/third_party/curl.BUILD index 10316df91e3..f3a7e3f59e7 100644 --- a/third_party/curl.BUILD +++ b/third_party/curl.BUILD @@ -321,7 +321,7 @@ cc_library( }), visibility = ["//visibility:public"], deps = [ - "@zlib_archive//:zlib", + "@zlib", ] + select({ "@org_tensorflow//tensorflow:ios": [], "@org_tensorflow//tensorflow:windows": [], diff --git a/third_party/llvm/llvm.autogenerated.BUILD b/third_party/llvm/llvm.autogenerated.BUILD index 8c53968111b..c80a2d2fce2 100644 --- a/third_party/llvm/llvm.autogenerated.BUILD +++ b/third_party/llvm/llvm.autogenerated.BUILD @@ -3752,7 +3752,7 @@ cc_library( deps = [ ":config", ":demangle", - "@zlib_archive//:zlib", + "@zlib", ], ) diff --git a/third_party/png.BUILD b/third_party/png.BUILD index e82948648e4..719d4c7c670 100644 --- a/third_party/png.BUILD +++ b/third_party/png.BUILD @@ -54,7 +54,7 @@ cc_library( "//conditions:default": ["-lm"], }), visibility = ["//visibility:public"], - deps = ["@zlib_archive//:zlib"], + deps = ["@zlib"], ) genrule( diff --git a/third_party/protobuf/protobuf.patch b/third_party/protobuf/protobuf.patch index efbe3340169..decd92e9d03 100644 --- a/third_party/protobuf/protobuf.patch +++ b/third_party/protobuf/protobuf.patch @@ -7,7 +7,7 @@ index 2fb26050..c2744d5b 100644 ################################################################################ -ZLIB_DEPS = ["@zlib//:zlib"] -+ZLIB_DEPS = ["@zlib_archive//:zlib"] ++ZLIB_DEPS = ["@zlib"] ################################################################################ # Protobuf Runtime Library diff --git a/third_party/systemlibs/syslibs_configure.bzl b/third_party/systemlibs/syslibs_configure.bzl index 0cfc289dffd..7a96fdf9d21 100644 --- a/third_party/systemlibs/syslibs_configure.bzl +++ b/third_party/systemlibs/syslibs_configure.bzl @@ -14,6 +14,7 @@ VALID_LIBS = [ "boringssl", "com_github_googleapis_googleapis", "com_github_googlecloudplatform_google_cloud_cpp", + "com_github_grpc_grpc", "com_google_protobuf", "com_googlesource_code_re2", "curl", @@ -24,7 +25,6 @@ VALID_LIBS = [ "functools32_archive", "gast_archive", "gif", - "grpc", "hwloc", "icu", "jsoncpp_git", @@ -42,7 +42,7 @@ VALID_LIBS = [ "swig", "termcolor_archive", "wrapt", - "zlib_archive", + "zlib", ] def auto_configure_fail(msg): From 666b51063f054eab82ea1355fd754712fed897b3 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 18 Feb 2020 14:38:01 -0800 Subject: [PATCH 179/442] Update ops-related pbtxt files. 
PiperOrigin-RevId: 295825111 Change-Id: I164fd70aa77b4a03ee58162f5bfcfad429a016cc --- .../ops_history_v1/FusedBatchNorm.pbtxt | 86 +++++++++++++++ .../ops_history_v1/FusedBatchNormV2.pbtxt | 97 +++++++++++++++++ .../ops_history_v1/FusedBatchNormV3.pbtxt | 101 ++++++++++++++++++ tensorflow/core/ops/ops.pbtxt | 21 ++++ 4 files changed, 305 insertions(+) diff --git a/tensorflow/core/ops/compat/ops_history_v1/FusedBatchNorm.pbtxt b/tensorflow/core/ops/compat/ops_history_v1/FusedBatchNorm.pbtxt index 9f30c2acf11..e5ac169b31e 100644 --- a/tensorflow/core/ops/compat/ops_history_v1/FusedBatchNorm.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v1/FusedBatchNorm.pbtxt @@ -77,3 +77,89 @@ op { } } } +op { + name: "FusedBatchNorm" + input_arg { + name: "x" + type_attr: "T" + } + input_arg { + name: "scale" + type_attr: "T" + } + input_arg { + name: "offset" + type_attr: "T" + } + input_arg { + name: "mean" + type_attr: "T" + } + input_arg { + name: "variance" + type_attr: "T" + } + output_arg { + name: "y" + type_attr: "T" + } + output_arg { + name: "batch_mean" + type_attr: "T" + } + output_arg { + name: "batch_variance" + type_attr: "T" + } + output_arg { + name: "reserve_space_1" + type_attr: "T" + } + output_arg { + name: "reserve_space_2" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + } + } + } + attr { + name: "epsilon" + type: "float" + default_value { + f: 0.0001 + } + } + attr { + name: "exponential_avg_factor" + type: "float" + default_value { + f: 1 + } + } + attr { + name: "data_format" + type: "string" + default_value { + s: "NHWC" + } + allowed_values { + list { + s: "NHWC" + s: "NCHW" + } + } + } + attr { + name: "is_training" + type: "bool" + default_value { + b: true + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v1/FusedBatchNormV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v1/FusedBatchNormV2.pbtxt index 170a90af2f5..99f482fc721 100644 --- a/tensorflow/core/ops/compat/ops_history_v1/FusedBatchNormV2.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v1/FusedBatchNormV2.pbtxt @@ -88,3 +88,100 @@ op { } } } +op { + name: "FusedBatchNormV2" + input_arg { + name: "x" + type_attr: "T" + } + input_arg { + name: "scale" + type_attr: "U" + } + input_arg { + name: "offset" + type_attr: "U" + } + input_arg { + name: "mean" + type_attr: "U" + } + input_arg { + name: "variance" + type_attr: "U" + } + output_arg { + name: "y" + type_attr: "T" + } + output_arg { + name: "batch_mean" + type_attr: "U" + } + output_arg { + name: "batch_variance" + type_attr: "U" + } + output_arg { + name: "reserve_space_1" + type_attr: "U" + } + output_arg { + name: "reserve_space_2" + type_attr: "U" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_HALF + type: DT_BFLOAT16 + type: DT_FLOAT + } + } + } + attr { + name: "U" + type: "type" + allowed_values { + list { + type: DT_FLOAT + } + } + } + attr { + name: "epsilon" + type: "float" + default_value { + f: 0.0001 + } + } + attr { + name: "exponential_avg_factor" + type: "float" + default_value { + f: 1 + } + } + attr { + name: "data_format" + type: "string" + default_value { + s: "NHWC" + } + allowed_values { + list { + s: "NHWC" + s: "NCHW" + } + } + } + attr { + name: "is_training" + type: "bool" + default_value { + b: true + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v1/FusedBatchNormV3.pbtxt b/tensorflow/core/ops/compat/ops_history_v1/FusedBatchNormV3.pbtxt index f79e4938cb0..a28965d2db8 100644 --- 
a/tensorflow/core/ops/compat/ops_history_v1/FusedBatchNormV3.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v1/FusedBatchNormV3.pbtxt @@ -92,3 +92,104 @@ op { } } } +op { + name: "FusedBatchNormV3" + input_arg { + name: "x" + type_attr: "T" + } + input_arg { + name: "scale" + type_attr: "U" + } + input_arg { + name: "offset" + type_attr: "U" + } + input_arg { + name: "mean" + type_attr: "U" + } + input_arg { + name: "variance" + type_attr: "U" + } + output_arg { + name: "y" + type_attr: "T" + } + output_arg { + name: "batch_mean" + type_attr: "U" + } + output_arg { + name: "batch_variance" + type_attr: "U" + } + output_arg { + name: "reserve_space_1" + type_attr: "U" + } + output_arg { + name: "reserve_space_2" + type_attr: "U" + } + output_arg { + name: "reserve_space_3" + type_attr: "U" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_HALF + type: DT_BFLOAT16 + type: DT_FLOAT + } + } + } + attr { + name: "U" + type: "type" + allowed_values { + list { + type: DT_FLOAT + } + } + } + attr { + name: "epsilon" + type: "float" + default_value { + f: 0.0001 + } + } + attr { + name: "exponential_avg_factor" + type: "float" + default_value { + f: 1 + } + } + attr { + name: "data_format" + type: "string" + default_value { + s: "NHWC" + } + allowed_values { + list { + s: "NHWC" + s: "NCHW" + } + } + } + attr { + name: "is_training" + type: "bool" + default_value { + b: true + } + } +} diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index 949cb99542d..526a1bfb46c 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -16169,6 +16169,13 @@ op { f: 0.0001 } } + attr { + name: "exponential_avg_factor" + type: "float" + default_value { + f: 1 + } + } attr { name: "data_format" type: "string" @@ -16522,6 +16529,13 @@ op { f: 0.0001 } } + attr { + name: "exponential_avg_factor" + type: "float" + default_value { + f: 1 + } + } attr { name: "data_format" type: "string" @@ -16616,6 +16630,13 @@ op { f: 0.0001 } } + attr { + name: "exponential_avg_factor" + type: "float" + default_value { + f: 1 + } + } attr { name: "data_format" type: "string" From 59840cf101741aac00070a066259bf0b6d4d17ec Mon Sep 17 00:00:00 2001 From: Jing Pu Date: Tue, 18 Feb 2020 14:39:28 -0800 Subject: [PATCH 180/442] Return "argX" for BlockArgument in OpOrArgLocNameMapper::GetName. This is for printing better debugging information. PiperOrigin-RevId: 295825438 Change-Id: I9b049656aa11a20692d328bcea9a8adf2a5bf1fd --- tensorflow/compiler/mlir/op_or_arg_name_mapper.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc b/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc index babfb478881..63f558bc9c5 100644 --- a/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc +++ b/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc @@ -168,6 +168,10 @@ std::string OpOrArgLocNameMapper::GetName(OpOrVal op_or_val) { result.getResultNumber()); return std::string(result.getOwner()->getName().getStringRef()); } + // Use the ASM syntax for BloackArgument + if (auto arg = val.dyn_cast()) { + return "arg" + std::to_string(arg.getArgNumber()); + } return ""; } From c347ded23c5fa658bcd315b4fdaa5e09ed4e3ef4 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Tue, 18 Feb 2020 14:53:17 -0800 Subject: [PATCH 181/442] Add support to tpu-v1-island-coarsening for operation that calls other functions, like tf.While tf.While is not annotated with the attribute, we need to consider it depending on the operations inside the condition/body. 
PiperOrigin-RevId: 295828493 Change-Id: If6e8f54178e579c4d09c0bf267e9c597407d5077 --- .../executor_tpuv1_island_coarsening.mlir | 0 .../while_op.mlir | 57 +++++++++++ .../executor_tpuv1_island_coarsening.cc | 98 ++++++++++++++----- .../mlir/tensorflow/transforms/passes.h | 3 +- 4 files changed, 130 insertions(+), 28 deletions(-) rename tensorflow/compiler/mlir/tensorflow/tests/{ => executor_tpuv1_island_coarsening}/executor_tpuv1_island_coarsening.mlir (100%) create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_coarsening/while_op.mlir diff --git a/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_coarsening.mlir b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_coarsening/executor_tpuv1_island_coarsening.mlir similarity index 100% rename from tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_coarsening.mlir rename to tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_coarsening/executor_tpuv1_island_coarsening.mlir diff --git a/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_coarsening/while_op.mlir b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_coarsening/while_op.mlir new file mode 100644 index 00000000000..59ece992756 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_coarsening/while_op.mlir @@ -0,0 +1,57 @@ +// RUN: tf-opt %s -tf-executor-tpu-v1-island-coarsening | FileCheck %s --dump-input=fail + + +// Test that islands with a function call are merged if the call is to a function +// that contains ops with the same attribute. +// CHECK-LABEL: func @control_input +func @control_input(%arg0 : tensor) -> tensor { + %0:6 = tf_executor.graph { + %1:2 = tf_executor.island wraps "tf.opA"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor + %2:2 = tf_executor.island wraps "tf.While"(%1#0) {name = "A", body = @while_body_with_cluster_attr, cond = @while_cond_with_cluster_attr, is_stateless = false, parallel_iterations = 10 : i64} : (tensor) -> tensor + %3:2 = tf_executor.island wraps "tf.While"(%1#0) {name = "B", body = @while_body_with_wrong_cluster_attr, cond = @while_cond_with_wrong_cluster_attr, is_stateless = false, parallel_iterations = 10 : i64} : (tensor) -> tensor + %4:2 = tf_executor.island wraps "tf.While"(%1#0) {name = "C", body = @while_body_without_cluster_attr, cond = @while_cond_with_cluster_attr, is_stateless = false, parallel_iterations = 10 : i64} : (tensor) -> tensor + %6:2 = tf_executor.island wraps "tf.While"(%1#0) {name = "D", body = @while_body_without_cluster_attr, cond = @while_cond_without_cluster_attr, is_stateless = false, parallel_iterations = 10 : i64} : (tensor) -> tensor + %5:2 = tf_executor.island wraps "tf.While"(%1#0) {name = "E", body = @while_body_with_cluster_attr, cond = @while_cond_without_cluster_attr, is_stateless = false, parallel_iterations = 10 : i64} : (tensor) -> tensor + +// CHECK: "tf.opA" +// CHECK-NOT: island +// CHECK: name = "A" +// CHECK-NOT: island +// CHECK: name = "C" +// CHECK-NOT: island +// CHECK: name = "E" +// CHECK: island {{.*}}name = "B" +// CHECK: island {{.*}}name = "D" + + tf_executor.fetch %1#0, %2#0, %3#0, %4#0, %5#0, %6#0 : tensor, tensor, tensor, tensor, tensor, tensor + } + return %0#0 : tensor +} + +func @while_body_with_cluster_attr(%arg0: tensor) -> tensor { + %0 = "some.op"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor + return %0 : tensor +} +func @while_cond_with_cluster_attr(%arg0: tensor) -> tensor { + %0 = "some.op"(%arg0) {_tpu_replicate = 
"cluster"} : (tensor) -> tensor + return %0 : tensor +} + +func @while_body_with_wrong_cluster_attr(%arg0: tensor) -> tensor { + %0 = "some.op"(%arg0) {_tpu_replicate = "wrong_cluster"} : (tensor) -> tensor + return %0 : tensor +} +func @while_cond_with_wrong_cluster_attr(%arg0: tensor) -> tensor { + %0 = "some.op"(%arg0) {_tpu_replicate = "wrong_cluster"} : (tensor) -> tensor + return %0 : tensor +} + +func @while_body_without_cluster_attr(%arg0: tensor) -> tensor { + %0 = "some.op"(%arg0) : (tensor) -> tensor + return %0 : tensor +} +func @while_cond_without_cluster_attr(%arg0: tensor) -> tensor { + %0 = "some.op"(%arg0) : (tensor) -> tensor + return %0 : tensor +} + diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_island_coarsening.cc b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_island_coarsening.cc index cd669abcc24..cc87bd31486 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_island_coarsening.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_island_coarsening.cc @@ -29,10 +29,12 @@ limitations under the License. #include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Casting.h" +#include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Block.h" // TF:llvm-project #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Location.h" // TF:llvm-project #include "mlir/IR/Operation.h" // TF:llvm-project +#include "mlir/IR/SymbolTable.h" // TF:llvm-project #include "mlir/IR/UseDefLists.h" // TF:llvm-project #include "mlir/IR/Visitors.h" // TF:llvm-project #include "mlir/Pass/Pass.h" // TF:llvm-project @@ -57,8 +59,8 @@ constexpr llvm::StringRef kTpuStatusAttr = "_tpu_compilation_status"; // TPU-annotated operations and intended to preserve backward compatibility with // TFv1. struct TpuV1BridgeExecutorIslandCoarsening - : public FunctionPass { - void runOnFunction() override; + : public ModulePass { + void runOnModule() override; }; // Sort the Operations in the provided range to enforce dominance. @@ -88,9 +90,10 @@ LogicalResult SortTopologically(Block::iterator first_op, Operation* producer_in_block = block->findAncestorOpInBlock(*defining_op); if (producer_in_block && producer_in_block != &op && - unscheduled_ops.count(producer_in_block)) + unscheduled_ops.count(producer_in_block)) { // Found an operand that isn't scheduled yet, interrupt the walk. return WalkResult::interrupt(); + } } return WalkResult::advance(); }); @@ -113,7 +116,9 @@ LogicalResult SortTopologically(Block::iterator first_op, // A failure is returned if a cycle preventing the merge from happening // correctly without breaking dominance. The IR is left in invalid state in case // of failure. -LogicalResult MergeIsland(Operation* op, bool* changed) { +LogicalResult MergeIsland(llvm::function_ref + is_op_calling_func_for_cluster, + Operation* op, bool* changed) { // Find the first island wrapping a single operation with the `_tpu_replicate` // attribute, it'll be used as the root of the algorithm to find the other // operations that are part of the same cluster. 
@@ -146,7 +151,9 @@ LogicalResult MergeIsland(Operation* op, bool* changed) { if (!candidate_cluster_name) candidate_cluster_name = candidate_wrapped_op.getAttrOfType(kTpuStatusAttr); - if (candidate_cluster_name != cluster_name) continue; + if (candidate_cluster_name != cluster_name && + !is_op_calling_func_for_cluster(cluster_name, &candidate_wrapped_op)) + continue; // Look at captured operands to bring-in ReplicatedInputOp in the // island as well. TODO: also pull in tf.Const, some optimizations can @@ -250,34 +257,71 @@ LogicalResult MergeIsland(Operation* op, bool* changed) { first_op_after); } -void TpuV1BridgeExecutorIslandCoarsening::runOnFunction() { - getFunction().walk([&](GraphOp graph) { - Block& graph_body = graph.GetBody(); +void TpuV1BridgeExecutorIslandCoarsening::runOnModule() { + SymbolTable symbol_table(getModule()); - // Iterate until fixed point on the block, as it may contain multiple - // clusters. - bool changed = true; - while (changed) { - changed = false; - for (Operation& op : graph_body) { - if (failed(MergeIsland(&op, &changed))) { - graph.emitError() << "Merging island failed: the TPU cluster likely " - << "contains a cycle with non-TPU operations\n"; - signalPassFailure(); - return WalkResult::interrupt(); - } - // If islands were merged, restart scanning the block from the beginning - // as we lost track of where to continue. - if (changed) break; - } + // Map tpu cluster names to the functions that contain operations for this + // cluster. + DenseMap> tpu_funcs; + for (FuncOp func_op : getModule().getOps()) { + func_op.walk([&](Operation* op) { + StringAttr cluster_name = + op->getAttrOfType(kTpuReplicateAttr); + if (!cluster_name) + cluster_name = op->getAttrOfType(kTpuStatusAttr); + if (!cluster_name) return; + tpu_funcs[cluster_name.getValue()].insert(func_op); + }); + } + + // Return true if the operation is containing a reference to a function + // containing operations for this cluster. + auto is_op_calling_func_for_cluster = [&](StringAttr cluster, Operation* op) { + auto funcs_for_cluster = tpu_funcs.find(cluster.getValue()); + assert(funcs_for_cluster != tpu_funcs.end()); + assert(!funcs_for_cluster->second.empty()); + if (funcs_for_cluster->second.size() == 1) return false; + for (NamedAttribute attr : op->getAttrs()) { + auto symbol_ref = attr.second.dyn_cast(); + if (!symbol_ref) continue; + FuncOp callee = symbol_table.lookup(symbol_ref.getValue()); + if (!callee) continue; + if (funcs_for_cluster->second.count(callee)) return true; } - return WalkResult::advance(); - }); + return false; + }; + + for (FuncOp func_op : getModule().getOps()) { + func_op.walk([&](GraphOp graph) { + Block& graph_body = graph.GetBody(); + + // Iterate until fixed point on the block, as it may contain multiple + // clusters. + bool changed = true; + while (changed) { + changed = false; + for (Operation& op : graph_body) { + if (failed( + MergeIsland(is_op_calling_func_for_cluster, &op, &changed))) { + graph.emitError() + << "Merging island failed: the TPU cluster likely " + << "contains a cycle with non-TPU operations\n"; + signalPassFailure(); + return WalkResult::interrupt(); + } + // If islands were merged, restart scanning the block from the + // beginning as we lost track of where to continue. 
+ if (changed) break; + } + } + return WalkResult::advance(); + }); + } } } // namespace -std::unique_ptr> +std::unique_ptr> CreateTFExecutorTPUV1IslandCoarseningPass() { return std::make_unique(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index 02cdb9dc229..ad6fc683b6d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -106,7 +106,8 @@ std::unique_ptr> CreateTFExecutorIslandCoarseningPass(); // Creates a pass to merge IslandOps for operation marked for execution on TPU. // This is a V1 backward compatibility. -std::unique_ptr> CreateTFExecutorTPUV1IslandCoarseningPass(); +std::unique_ptr> +CreateTFExecutorTPUV1IslandCoarseningPass(); // Creates a pass to outlining TPU clusters from single IslandOp into a nested // module suitable for being processed as-if it was a V2 module. From 5c16c2c48a3ac44f20ab3dac2493b4c261915455 Mon Sep 17 00:00:00 2001 From: Ken Franko Date: Tue, 18 Feb 2020 14:56:03 -0800 Subject: [PATCH 182/442] Automated rollback of commit 36fe0e7aadccfcba4b5dd5ed35c9995dceb6e4b6 PiperOrigin-RevId: 295829087 Change-Id: I69c415ff72eeffda1a993e114ee8f3679710faac --- .../python/distribute/cross_device_ops.py | 2 +- .../distribute/mirrored_strategy_test.py | 6 +- .../distribute/parameter_server_strategy.py | 2 +- tensorflow/python/distribute/values.py | 57 ++++++++++--------- tensorflow/python/saved_model/save.py | 2 +- 5 files changed, 35 insertions(+), 34 deletions(-) diff --git a/tensorflow/python/distribute/cross_device_ops.py b/tensorflow/python/distribute/cross_device_ops.py index 4b2814eca3e..9d44f5c554c 100644 --- a/tensorflow/python/distribute/cross_device_ops.py +++ b/tensorflow/python/distribute/cross_device_ops.py @@ -1032,7 +1032,7 @@ class CollectiveAllReduce(CrossDeviceOps): else: # TODO(josh11b): Once we add support for model parallelism, get the # copy from the corresponding replica instead of the primary. - index.append(array_ops.identity(all_reduced.primary)) + index.append(array_ops.identity(all_reduced._primary)) # pylint: disable=protected-access return value_lib.regroup(index, wrap_class=value_lib.Mirrored) def batch_reduce_implementation(self, reduce_op, value_destination_pairs): diff --git a/tensorflow/python/distribute/mirrored_strategy_test.py b/tensorflow/python/distribute/mirrored_strategy_test.py index b2ab4bb6ec6..fa7e4a8fcd4 100644 --- a/tensorflow/python/distribute/mirrored_strategy_test.py +++ b/tensorflow/python/distribute/mirrored_strategy_test.py @@ -1334,7 +1334,7 @@ class FunctionTest(test.TestCase): def forward(x, w, b): return x * w + b x = constant_op.constant([1.0], name="x_useless") - concrete_forward = forward.get_concrete_function(x, w.primary, b.primary) + concrete_forward = forward.get_concrete_function(x, w._primary, b._primary) with ms.scope(): def replica_fn(): @@ -1350,8 +1350,8 @@ class FunctionTest(test.TestCase): g1, g2 = step_fn() run_metadata = context.export_run_metadata() context.disable_run_metadata() - self.assertEqual(self.evaluate(g1.primary), 1.0) - self.assertEqual(self.evaluate(g2.primary), 1.0) + self.assertEqual(self.evaluate(g1._primary), 1.0) + self.assertEqual(self.evaluate(g2._primary), 1.0) # Verify that this node runs on both devices. 
node_name = "gradients_mul_grad_mul_1_x" diff --git a/tensorflow/python/distribute/parameter_server_strategy.py b/tensorflow/python/distribute/parameter_server_strategy.py index 41ea9e3fcb9..a807d4ae9ff 100644 --- a/tensorflow/python/distribute/parameter_server_strategy.py +++ b/tensorflow/python/distribute/parameter_server_strategy.py @@ -487,7 +487,7 @@ class ParameterServerStrategyExtended(distribute_lib.StrategyExtendedV1): def _select_fn(x): # pylint: disable=g-missing-docstring if isinstance(x, values.Mirrored): if len(x.devices) == 1: - return x.primary + return x._primary # pylint: disable=protected-access else: raise ValueError( "You cannot update variable with a Mirrored object with multiple " diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py index 570c3c35cbf..fb3e2ffd817 100644 --- a/tensorflow/python/distribute/values.py +++ b/tensorflow/python/distribute/values.py @@ -75,7 +75,7 @@ class DistributedValues(object): "replica accesses.") def _get_closest(self): - """Returns value in same replica or device if possible, else the primary.""" + """Returns value in same replica or device if possible, else the _primary.""" replica_id = _get_current_replica_id_as_int() if replica_id is None: # Try to find a value on the current device. @@ -83,12 +83,12 @@ class DistributedValues(object): for value in self._values: if device_util.canonicalize(value.device) == current_device: return value - return self.primary + return self._primary else: return self._values[replica_id] @property - def primary(self): + def _primary(self): """Returns a representative component.""" return self._values[0] @@ -368,7 +368,7 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable): def __init__(self, strategy, values): self._distribute_strategy = strategy super(DistributedVariable, self).__init__(values) - self._common_name = self.primary.name.split(":")[0] + self._common_name = self._primary.name.split(":")[0] # Use a weakref to make it easy to map from the contained values # to the container without introducing a reference cycle. for v in values: @@ -395,7 +395,7 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable): The op that evaluates to True or False depending on if all the component variables are initialized. """ - result = self.primary.is_initialized() + result = self._primary.is_initialized() # We iterate through the list of values except the last one to allow us to # name the final `logical_and` op the same name that is passed by the user # to the `is_initialized` op. 
For distributed variables, the @@ -426,11 +426,11 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable): @property def constraint(self): - return self.primary.constraint + return self._primary.constraint @property def graph(self): - return self.primary.graph + return self._primary.graph @property def _shared_name(self): @@ -438,28 +438,28 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable): @property def _unique_id(self): - return self.primary._unique_id # pylint: disable=protected-access + return self._primary._unique_id # pylint: disable=protected-access @property def _graph_key(self): """Lets Optimizers know which graph this variable is from.""" - return self.primary._graph_key # pylint: disable=protected-access + return self._primary._graph_key # pylint: disable=protected-access @property def name(self): - return self.primary.name + return self._primary.name @property def dtype(self): - return self.primary.dtype + return self._primary.dtype @property def shape(self): - return self.primary.shape + return self._primary.shape @property def synchronization(self): - return self.primary.synchronization + return self._primary.synchronization @property def handle(self): @@ -475,10 +475,10 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable): @property def _save_slice_info(self): - return self.primary._save_slice_info # pylint: disable=protected-access + return self._primary._save_slice_info # pylint: disable=protected-access def _get_save_slice_info(self): - return self.primary._get_save_slice_info() # pylint: disable=protected-access + return self._primary._get_save_slice_info() # pylint: disable=protected-access def _set_save_slice_info(self, save_slice_info): for v in self._values: @@ -490,17 +490,17 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable): @property def trainable(self): - return self.primary.trainable + return self._primary.trainable @property def distribute_strategy(self): return self._distribute_strategy def get_shape(self): - return self.primary.get_shape() + return self._primary.get_shape() def to_proto(self, export_scope=None): - return self.primary.to_proto(export_scope=export_scope) + return self._primary.to_proto(export_scope=export_scope) @property def op(self): @@ -508,13 +508,13 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable): # to work (even if the current device isn't in self.devices), but # other uses of var.op in a cross-replica context to fail. if distribution_strategy_context.in_cross_replica_context(): - return DistributedVarOp(self.primary.op.name, self.primary.op.graph, - self.primary.op.traceback, self.primary.op.type) + return DistributedVarOp(self._primary.op.name, self._primary.op.graph, + self._primary.op.traceback, self._primary.op.type) return self._get().op @property def _in_graph_mode(self): - return self.primary._in_graph_mode # pylint: disable=protected-access + return self._primary._in_graph_mode # pylint: disable=protected-access def read_value(self): with _enter_or_assert_strategy(self._distribute_strategy): @@ -567,7 +567,7 @@ class TPUVariableMixin(object): # Handle ID is needed for `get_replicated_var_handle` to cache the variables # correctly since in eager mode different variables can have the same name. 
if ops.executing_eagerly_outside_functions(): - self._handle_id = self._common_name + "_" + str(id(self.primary)) + self._handle_id = self._common_name + "_" + str(id(self._primary)) else: self._handle_id = self._common_name @@ -592,7 +592,7 @@ class TPUVariableMixin(object): if _enclosing_tpu_context() is None: return super(TPUVariableMixin, self)._get_closest() else: - return self.primary + return self._primary def numpy(self): if context.executing_eagerly(): @@ -644,8 +644,8 @@ class TPUVariableMixin(object): @property def op(self): - return DistributedVarOp(self.primary.op.name, self.primary.op.graph, - self.primary.op.traceback, self.primary.op.type) + return DistributedVarOp(self._primary.op.name, self._primary.op.graph, + self._primary.op.traceback, self._primary.op.type) def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False): """Converts a variable to a tensor.""" @@ -900,7 +900,7 @@ class MirroredVariable(DistributedVariable, Mirrored): """ def _saveable_factory(name=self._common_name): - return _MirroredSaveable(self, self.primary, name) + return _MirroredSaveable(self, self._primary, name) return {trackable.VARIABLE_VALUE_KEY: _saveable_factory} @@ -1003,7 +1003,8 @@ class _SyncOnReadSaveable(saver.BaseSaverBuilder.SaveableObject): slice_spec="", name=name, dtype=sync_on_read_variable.dtype, - device=sync_on_read_variable.primary.device) + device=sync_on_read_variable._primary.device) # pylint: disable=protected-access + super(_SyncOnReadSaveable, self).__init__(tensor, [spec], name) def restore(self, restored_tensors, restored_shapes): @@ -1103,7 +1104,7 @@ class SyncOnReadVariable(DistributedVariable): def _get_cross_replica(self): if self._aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA: - return self.primary + return self._primary with _enter_or_assert_strategy(self._distribute_strategy): return self._distribute_strategy.reduce( diff --git a/tensorflow/python/saved_model/save.py b/tensorflow/python/saved_model/save.py index 617f5e83a01..ced4135526a 100644 --- a/tensorflow/python/saved_model/save.py +++ b/tensorflow/python/saved_model/save.py @@ -274,7 +274,7 @@ class _SaveableView(object): self.captured_tensor_node_ids[obj.resource_handle] = node_id elif (ds_values.is_distributed_variable(obj) or resource_variable_ops.is_resource_variable(obj)): - obj_to_copy = obj.primary if ds_values.is_distributed_variable( + obj_to_copy = obj._primary if ds_values.is_distributed_variable( # pylint: disable=protected-access obj) else obj new_variable = resource_variable_ops.copy_to_graph_uninitialized( obj_to_copy) From ed493143b14c31ebf16881a815e8904e6a82ff9a Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Tue, 18 Feb 2020 15:10:22 -0800 Subject: [PATCH 183/442] Automated rollback of commit 6a202bc94b845ca4bb3f67884f3683ee2492e825 PiperOrigin-RevId: 295832353 Change-Id: I79feef342ad69ade7121b94c1c1a44e7c5d777b4 --- .../python/ops/ragged/ragged_getitem.py | 84 +++------------ .../python/ops/ragged/ragged_tensor_test.py | 100 ++---------------- 2 files changed, 21 insertions(+), 163 deletions(-) diff --git a/tensorflow/python/ops/ragged/ragged_getitem.py b/tensorflow/python/ops/ragged/ragged_getitem.py index ba4b13387b4..eca3cc3cdfa 100644 --- a/tensorflow/python/ops/ragged/ragged_getitem.py +++ b/tensorflow/python/ops/ragged/ragged_getitem.py @@ -19,12 +19,9 @@ from __future__ import division from __future__ import print_function from tensorflow.python.eager import context -from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops -from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops -from tensorflow.python.ops import check_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops.ragged import ragged_gather_ops @@ -44,6 +41,9 @@ def ragged_tensor_getitem(self, key): principles of Python ("In the face of ambiguity, refuse the temptation to guess"), we simply disallow this operation. + Any dimensions added by `array_ops.newaxis` will be ragged if the following + dimension is ragged. + Args: self: The RaggedTensor to slice. key: Indicates which piece of the RaggedTensor to return, using standard @@ -134,26 +134,15 @@ def _ragged_getitem(rt_input, key_list): # that puts all values in a single row. if row_key is array_ops.newaxis: inner_rt = _ragged_getitem(rt_input, inner_keys) - nsplits = tensor_shape.dimension_at_index(inner_rt.row_splits.shape, 0) - if nsplits.value is not None: - nsplits = nsplits.value - else: - nsplits = array_ops.shape(inner_rt.row_splits, - out_type=inner_rt.row_splits.dtype)[0] - return ragged_tensor.RaggedTensor.from_uniform_row_length( - inner_rt, nsplits - 1, nrows=1, validate=False) + nsplits = array_ops.shape(inner_rt.row_splits, + out_type=inner_rt.row_splits.dtype)[0] + return ragged_tensor.RaggedTensor.from_row_splits( + inner_rt, array_ops.stack([0, nsplits - 1]), validate=False) # Slicing a range of rows: first slice the outer dimension, and then # call `_ragged_getitem_inner_dimensions` to handle the inner keys. if isinstance(row_key, slice): sliced_rt_input = _slice_ragged_row_dimension(rt_input, row_key) - if rt_input.uniform_row_length is not None: - # If the inner dimension has uniform_row_length, then preserve it (by - # re-wrapping the values in a new RaggedTensor). Note that the row - # length won't have changed, since we're slicing a range of rows (and not - # slicing the rows themselves). - sliced_rt_input = ragged_tensor.RaggedTensor.from_uniform_row_length( - sliced_rt_input.values, rt_input.uniform_row_length) return _ragged_getitem_inner_dimensions(sliced_rt_input, inner_keys) # Indexing a single row: slice values to get the indicated row, and then @@ -256,14 +245,11 @@ def _ragged_getitem_inner_dimensions(rt_input, key_list): # RaggedTensor that puts each value in its own row. 
if column_key is array_ops.newaxis: inner_rt = _ragged_getitem_inner_dimensions(rt_input, key_list[1:]) - nsplits = tensor_shape.dimension_at_index(inner_rt.row_splits.shape, 0) - if nsplits.value is not None: - nsplits = nsplits.value - else: - nsplits = array_ops.shape(inner_rt.row_splits, - out_type=inner_rt.row_splits.dtype)[0] - return ragged_tensor.RaggedTensor.from_uniform_row_length( - inner_rt, 1, nrows=nsplits - 1, validate=False) + nsplits = array_ops.shape(inner_rt.row_splits, + out_type=inner_rt.row_splits.dtype)[0] + return ragged_tensor.RaggedTensor.from_row_splits(inner_rt, + math_ops.range(nsplits), + validate=False) # Slicing a range of columns in a ragged inner dimension. We use a # recursive call to process the values, and then assemble a RaggedTensor @@ -306,59 +292,15 @@ def _ragged_getitem_inner_dimensions(rt_input, key_list): lambda: math_ops.maximum(limits + stop_offset, lower_bound)) inner_rt = _build_ragged_tensor_from_value_ranges( inner_rt_starts, inner_rt_limits, column_key.step, rt_input.values) - # If the row dimension is uniform, then calculate the new - # uniform_row_length, and rebuild inner_rt using that uniform_row_lengths. - if rt_input.uniform_row_length is not None: - new_row_length = _slice_length(rt_input.uniform_row_length, column_key) - inner_rt = ragged_tensor.RaggedTensor.from_uniform_row_length( - inner_rt.values, new_row_length, rt_input.nrows()) return inner_rt.with_values( _ragged_getitem_inner_dimensions(inner_rt.values, key_list[1:])) # Indexing a single column in a ragged inner dimension: raise an Exception. # See RaggedTensor.__getitem__.__doc__ for an explanation of why indexing # into a ragged inner dimension is problematic. - if rt_input.uniform_row_length is None: + else: raise ValueError("Cannot index into an inner ragged dimension.") - # Indexing a single column in a uniform inner dimension: check that the - # given index is in-bounds, and then use a strided slice over rt_input.values - # to take the indicated element from each row. - row_length = rt_input.uniform_row_length - column_key = math_ops.cast(column_key, row_length.dtype) - oob_err_msg = "Index out of bounds when indexing into a ragged tensor" - oob_checks = [ - check_ops.assert_greater_equal( - column_key, -row_length, message=oob_err_msg), - check_ops.assert_less(column_key, row_length, message=oob_err_msg), - ] - with ops.control_dependencies(oob_checks): - offset = _if_ge_zero(column_key, lambda: column_key, - lambda: row_length + column_key) - sliced_rt = rt_input.values[offset::row_length] - return _ragged_getitem_inner_dimensions(sliced_rt, key_list[1:]) - - -def _slice_length(value_length, slice_key): - """Computes the number of elements in a slice of a value with a given length. - - Returns the equivalent of: `len(range(value_length)[slice_key])` - - Args: - value_length: Scalar int `Tensor`: the length of the value being sliced. - slice_key: A `slice` object used to slice elements from the the value. - - Returns: - The number of elements in the sliced value. - """ - # Note: we could compute the slice length without creating a zeros tensor - # with some variant of (stop-start)//step, but doing so would require more - # ops (for checking bounds, handling negative indices, negative step sizes, - # etc); and we expect this to be an uncommon operation, so we use this - # simpler implementation. 
- zeros = array_ops.zeros(value_length, dtype=dtypes.bool) - return array_ops.size(zeros[slice_key], out_type=value_length.dtype) - def _expand_ellipsis(key_list, num_remaining_dims): """Expands the ellipsis at the start of `key_list`. diff --git a/tensorflow/python/ops/ragged/ragged_tensor_test.py b/tensorflow/python/ops/ragged/ragged_tensor_test.py index f4c75d26699..6bc066e5d84 100644 --- a/tensorflow/python/ops/ragged/ragged_tensor_test.py +++ b/tensorflow/python/ops/ragged/ragged_tensor_test.py @@ -116,12 +116,6 @@ EXAMPLE_RAGGED_TENSOR_4D_VALUES = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16], [17, 18], [19, 20]] -# Example 3D ragged tensor with uniform_row_lengths. -EXAMPLE_RAGGED_TENSOR_3D = [[[1, 2, 3], [4], [5, 6]], [[], [7, 8, 9], []]] -EXAMPLE_RAGGED_TENSOR_3D_ROWLEN = 3 -EXAMPLE_RAGGED_TENSOR_3D_SPLITS = [0, 3, 4, 6, 6, 9, 9] -EXAMPLE_RAGGED_TENSOR_3D_VALUES = [1, 2, 3, 4, 5, 6, 7, 8, 9] - def int32array(values): return np.array(values, dtype=np.int32) @@ -843,7 +837,7 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, # RaggedTensor.__getitem__ #============================================================================= - def _TestGetItem(self, rt, slice_spec, expected, expected_shape=None): + def _TestGetItem(self, rt, slice_spec, expected): """Helper function for testing RaggedTensor.__getitem__. Checks that calling `rt.__getitem__(slice_spec) returns the expected value. @@ -861,7 +855,6 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, slice_spec: The slice spec. expected: The expected value of rt.__getitem__(slice_spec), as a python list; or an exception class. - expected_shape: The expected shape for `rt.__getitem__(slice_spec)`. """ tensor_slice_spec1 = _make_tensor_slice_spec(slice_spec, True) tensor_slice_spec2 = _make_tensor_slice_spec(slice_spec, False) @@ -871,18 +864,13 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, self.assertAllEqual(value1, expected, 'slice_spec=%s' % (slice_spec,)) self.assertAllEqual(value2, expected, 'slice_spec=%s' % (slice_spec,)) self.assertAllEqual(value3, expected, 'slice_spec=%s' % (slice_spec,)) - if expected_shape is not None: - value1.shape.assert_is_compatible_with(expected_shape) - value2.shape.assert_is_compatible_with(expected_shape) - value3.shape.assert_is_compatible_with(expected_shape) def _TestGetItemException(self, rt, slice_spec, expected, message): """Helper function for testing RaggedTensor.__getitem__ exceptions.""" - tensor_slice_spec = _make_tensor_slice_spec(slice_spec, True) - with self.assertRaisesRegexp(expected, message): - self.evaluate(rt.__getitem__(slice_spec)) - with self.assertRaisesRegexp(expected, message): - self.evaluate(rt.__getitem__(tensor_slice_spec)) + tensor_slice_spec1 = _make_tensor_slice_spec(slice_spec, True) + self.assertRaisesRegexp(expected, message, rt.__getitem__, slice_spec) + self.assertRaisesRegexp(expected, message, rt.__getitem__, + tensor_slice_spec1) @parameterized.parameters( # Tests for rt[i] @@ -1237,84 +1225,12 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, self.assertEqual(rt_newaxis3.ragged_rank, 2) self.assertEqual(rt_newaxis4.ragged_rank, 2) - self.assertEqual(rt_newaxis0.shape.as_list(), [1, 2, None, None, 2]) - self.assertEqual(rt_newaxis1.shape.as_list(), [2, 1, None, None, 2]) - self.assertEqual(rt_newaxis2.shape.as_list(), [2, None, 1, None, 2]) + self.assertEqual(rt_newaxis0.shape.as_list(), [1, None, None, None, 2]) + self.assertEqual(rt_newaxis1.shape.as_list(), [2, None, None, None, 2]) + 
self.assertEqual(rt_newaxis2.shape.as_list(), [2, None, None, None, 2]) self.assertEqual(rt_newaxis3.shape.as_list(), [2, None, None, 1, 2]) self.assertEqual(rt_newaxis4.shape.as_list(), [2, None, None, 2, 1]) - @parameterized.parameters( - # EXAMPLE_RAGGED_TENSOR_3D.shape = [2, 3, None] - - # Indexing into uniform_row_splits dimension: - (SLICE_BUILDER[:, 1], [r[1] for r in EXAMPLE_RAGGED_TENSOR_3D], - [2, None]), - (SLICE_BUILDER[:, 2], [r[2] for r in EXAMPLE_RAGGED_TENSOR_3D], - [2, None]), - (SLICE_BUILDER[:, -2], [r[-2] for r in EXAMPLE_RAGGED_TENSOR_3D], - [2, None]), - (SLICE_BUILDER[:, -3], [r[-3] for r in EXAMPLE_RAGGED_TENSOR_3D], - [2, None]), - (SLICE_BUILDER[1:, 2], [r[2] for r in EXAMPLE_RAGGED_TENSOR_3D[1:]], - [1, None]), - (SLICE_BUILDER[:, 1, 1:], [r[1][1:] for r in EXAMPLE_RAGGED_TENSOR_3D], - [2, None]), - (SLICE_BUILDER[1:, 1, 1:], - [r[1][1:] for r in EXAMPLE_RAGGED_TENSOR_3D[1:]], - [1, None]), - - # Slicing uniform_row_splits dimension: - (SLICE_BUILDER[:, 2:], [r[2:] for r in EXAMPLE_RAGGED_TENSOR_3D], - [2, 1, None]), - (SLICE_BUILDER[:, -2:], [r[-2:] for r in EXAMPLE_RAGGED_TENSOR_3D], - [2, 2, None]), - (SLICE_BUILDER[:, :, 1:], - [[c[1:] for c in r] for r in EXAMPLE_RAGGED_TENSOR_3D], - [2, 3, None]), - (SLICE_BUILDER[:, 5:], [r[5:] for r in EXAMPLE_RAGGED_TENSOR_3D], - [2, 0, None]), - - # Slicing uniform_row_splits dimension with a non-default step size: - (SLICE_BUILDER[:, ::2], [r[::2] for r in EXAMPLE_RAGGED_TENSOR_3D], - [2, 2, None]), - (SLICE_BUILDER[:, ::-1], [r[::-1] for r in EXAMPLE_RAGGED_TENSOR_3D], - [2, 3, None]), - ) - def testRaggedTensorGetItemWithUniformRowLength(self, slice_spec, expected, - expected_shape): - """Test that rt.__getitem__(slice_spec) == expected.""" - rt = RaggedTensor.from_uniform_row_length( - RaggedTensor.from_row_splits( - EXAMPLE_RAGGED_TENSOR_3D_VALUES, - EXAMPLE_RAGGED_TENSOR_3D_SPLITS), - EXAMPLE_RAGGED_TENSOR_3D_ROWLEN) - self.assertAllEqual(rt, EXAMPLE_RAGGED_TENSOR_3D) - self.assertIsNot(rt.uniform_row_length, None) - self._TestGetItem(rt, slice_spec, expected, expected_shape) - - # If the result is 3D, then check that it still has a uniform row length: - actual = rt.__getitem__(slice_spec) - if actual.shape.rank == 3: - self.assertIsNot(actual.uniform_row_length, None) - self.assertAllEqual(actual.uniform_row_length, expected_shape[1]) - - @parameterized.parameters( - (SLICE_BUILDER[:, 3], errors.InvalidArgumentError, 'out of bounds'), - (SLICE_BUILDER[:, -4], errors.InvalidArgumentError, 'out of bounds'), - (SLICE_BUILDER[:, 10], errors.InvalidArgumentError, 'out of bounds'), - (SLICE_BUILDER[:, -10], errors.InvalidArgumentError, 'out of bounds'), - ) - def testRaggedTensorGetItemErrorsWithUniformRowLength(self, slice_spec, - expected, message): - """Test that rt.__getitem__(slice_spec) == expected.""" - rt = RaggedTensor.from_uniform_row_length( - RaggedTensor.from_row_splits( - EXAMPLE_RAGGED_TENSOR_3D_VALUES, - EXAMPLE_RAGGED_TENSOR_3D_SPLITS), - EXAMPLE_RAGGED_TENSOR_3D_ROWLEN) - self.assertAllEqual(rt, EXAMPLE_RAGGED_TENSOR_3D) - self._TestGetItemException(rt, slice_spec, expected, message) - #============================================================================= # RaggedTensor.__str__ #============================================================================= From e0fe1b1949feca9eabe25297f620dfe1d05e6aec Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Tue, 18 Feb 2020 15:14:22 -0800 Subject: [PATCH 184/442] Use GetDataDependencyFilepath and JoinPath to reference llvm-project/llvm/FileCheck 
This enables the code to work on windows as well. PiperOrigin-RevId: 295833234 Change-Id: I4dff88fde871eb799e8a9413d66dd7b74c81394f --- tensorflow/compiler/xla/tests/BUILD | 1 + tensorflow/compiler/xla/tests/filecheck.cc | 12 ++++-------- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index 68c5538b1db..540a63405ef 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -262,6 +262,7 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", + "//tensorflow/core/platform:resource_loader", ], ) diff --git a/tensorflow/compiler/xla/tests/filecheck.cc b/tensorflow/compiler/xla/tests/filecheck.cc index 91d1052fc64..5926ebece39 100644 --- a/tensorflow/compiler/xla/tests/filecheck.cc +++ b/tensorflow/compiler/xla/tests/filecheck.cc @@ -22,6 +22,8 @@ limitations under the License. #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/resource_loader.h" #include "tensorflow/core/platform/subprocess.h" namespace xla { @@ -39,14 +41,8 @@ StatusOr RunFileCheck(const std::string& input, TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(env, pattern_path, pattern)); // Invoke FileCheck to check whether input matches `pattern`. - const char* file_check_path_suffix = - "org_tensorflow/external/llvm-project/llvm/FileCheck"; - string file_check_path; - if (const char* test_srcdir = getenv("TEST_SRCDIR")) { - file_check_path = JoinPath(test_srcdir, file_check_path_suffix); - } else { - file_check_path = file_check_path_suffix; - } + string file_check_path = tensorflow::GetDataDependencyFilepath( + JoinPath("external", "llvm-project", "llvm", "FileCheck")); tensorflow::SubProcess file_check_process; file_check_process.SetProgram( From f8822b0a55b76aa53846eb55cfd5f2737ce28829 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 18 Feb 2020 15:30:23 -0800 Subject: [PATCH 185/442] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 295837032 Change-Id: Id1a8271fe95e188c8308aeb227ec52d23c848642 --- tensorflow/go/op/wrappers.go | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index ffa9931d561..c744d5b466a 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -20516,6 +20516,14 @@ func FusedBatchNormV2Epsilon(value float32) FusedBatchNormV2Attr { } } +// FusedBatchNormV2ExponentialAvgFactor sets the optional exponential_avg_factor attribute to value. +// If not specified, defaults to 1 +func FusedBatchNormV2ExponentialAvgFactor(value float32) FusedBatchNormV2Attr { + return func(m optionalAttr) { + m["exponential_avg_factor"] = value + } +} + // FusedBatchNormV2DataFormat sets the optional data_format attribute to value. // // value: The data format for x and y. Either "NHWC" (default) or "NCHW". @@ -20783,6 +20791,14 @@ func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr { } } +// FusedBatchNormExponentialAvgFactor sets the optional exponential_avg_factor attribute to value. 
+// If not specified, defaults to 1 +func FusedBatchNormExponentialAvgFactor(value float32) FusedBatchNormAttr { + return func(m optionalAttr) { + m["exponential_avg_factor"] = value + } +} + // FusedBatchNormDataFormat sets the optional data_format attribute to value. // // value: The data format for x and y. Either "NHWC" (default) or "NCHW". @@ -34194,6 +34210,14 @@ func FusedBatchNormV3Epsilon(value float32) FusedBatchNormV3Attr { } } +// FusedBatchNormV3ExponentialAvgFactor sets the optional exponential_avg_factor attribute to value. +// If not specified, defaults to 1 +func FusedBatchNormV3ExponentialAvgFactor(value float32) FusedBatchNormV3Attr { + return func(m optionalAttr) { + m["exponential_avg_factor"] = value + } +} + // FusedBatchNormV3DataFormat sets the optional data_format attribute to value. // // value: The data format for x and y. Either "NHWC" (default) or "NCHW". From 9189ce99fc88edb2af11ffcf93fde630f94366f7 Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Tue, 18 Feb 2020 15:33:42 -0800 Subject: [PATCH 186/442] Improve the completeness of the CFG by drawing edges from raise statements to all enclosing except blocks. PiperOrigin-RevId: 295837876 Change-Id: I34e6ad8eb50e984fd526948d66ceaaf27c3b453a --- tensorflow/python/autograph/pyct/cfg.py | 76 ++++++++++--- tensorflow/python/autograph/pyct/cfg_test.py | 112 ++++++++++++++++++- 2 files changed, 167 insertions(+), 21 deletions(-) diff --git a/tensorflow/python/autograph/pyct/cfg.py b/tensorflow/python/autograph/pyct/cfg.py index 71145802ed9..194c39802db 100644 --- a/tensorflow/python/autograph/pyct/cfg.py +++ b/tensorflow/python/autograph/pyct/cfg.py @@ -21,13 +21,14 @@ a corresponding CFG counterpart. Once built, the CFG itself is immutable, but the values it holds need not be; they are usually annotated with information extracted by walking the graph. -Note: the CFG tries to include all code paths that MAY be taken, with the -follwing exceptions: +Tip: Use `Graph.as_dot` to visualize the CFG using any DOT viewer. + +Note: the CFG tries to include all code paths that MAY be taken, with a single +notable exception: * function calls do not generate edges corresponding to exceptions they may - raise (i.e. a function call in the middle of a block does not exit or jump - to an except block) - * raise never generates an edge to an except block -(TODO:mdan): Remove this last bullet. + raise (i.e. a function call in the middle of a block does not return or jump + to any except or finally block) +TODO(mdan): Consider adding the edges above. They'd only add ~O(n) edges. """ # TODO(mdan): The notion of 'statements' below is inaccurate. @@ -309,6 +310,9 @@ class GraphBuilder(object): # Continue jumps keyed by the section they affect. self.continues = {} + # Raise jumps keyed by the except section guarding them. + self.raises = {} + # The entry of conditional sections, keyed by the section. self.cond_entry = {} # Lists of leaf nodes corresponding to each branch in the section. @@ -429,9 +433,12 @@ class GraphBuilder(object): section_id: Hashable, the node for which ast_node should be considered to be an exit node guards: Tuple[ast.AST, ...], the finally sections that guard ast_node + Returns: + Node """ node = self._add_jump_node(ast_node, guards) self.exits[section_id].add(node) + return node def add_continue_node(self, ast_node, section_id, guards): """Grows the graph by adding a reentry node. 
@@ -447,6 +454,21 @@ class GraphBuilder(object): node = self._add_jump_node(ast_node, guards) self.continues[section_id].add(node) + def connect_raise_node(self, node, except_guards): + """Adds extra connection between a raise node and containing except guards. + + The node is a graph node, not an ast node. + + Args: + node: Node + except_guards: Tuple[ast.AST, ...], the except sections that guard node + """ + for guard in except_guards: + if guard in self.raises: + self.raises[guard].append(node) + else: + self.raises[guard] = [node] + def enter_section(self, section_id): """Enters a regular section. @@ -537,6 +559,11 @@ class GraphBuilder(object): del self.cond_entry[section_id] del self.cond_leaves[section_id] + def enter_except_section(self, section_id): + """Enters an except section.""" + if section_id in self.raises: + self.leaves.update(self.raises[section_id]) + def enter_finally_section(self, section_id): """Enters a finally section.""" # TODO(mdan): This, not the caller, should track the active sections. @@ -636,18 +663,31 @@ class AstToCfg(gast.NodeVisitor): return node, included return None, included + def _get_enclosing_except_scopes(self, stop_at): + included = [] + for node in reversed(self.lexical_scopes): + if isinstance(node, gast.Try) and node.handlers: + included.extend(node.handlers) + if isinstance(node, stop_at): + break + return included + def _process_basic_statement(self, node): self.generic_visit(node) self.builder.add_ordinary_node(node) - def _process_exit_statement(self, node, *exits_nodes_of_type): + def _process_exit_statement( + self, node, exits_nodes_of_type, may_exit_via_except=False): # Note: this is safe because we process functions separately. - try_node, guards = self._get_enclosing_finally_scopes( - tuple(exits_nodes_of_type)) - if try_node is None: - raise ValueError( - '%s that is not enclosed by any of %s' % (node, exits_nodes_of_type)) - self.builder.add_exit_node(node, try_node, guards) + try_node, guards = self._get_enclosing_finally_scopes(exits_nodes_of_type) + assert try_node is not None, '{} that is not enclosed by any of {}'.format( + node, exits_nodes_of_type) + + node = self.builder.add_exit_node(node, try_node, guards) + + if may_exit_via_except: + except_guards = self._get_enclosing_except_scopes(exits_nodes_of_type) + self.builder.connect_raise_node(node, except_guards) def _process_continue_statement(self, node, *loops_to_nodes_of_type): # Note: this is safe because we process functions separately. 
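Stated more directly, the builder now tracks one extra mapping while walking a function: each `raise` node is recorded against every enclosing `except` handler, and when a handler's section is entered those recorded nodes are added to the current leaves, so the handler body receives an incoming edge from the raise. The snippet below is only a toy restatement of that bookkeeping with made-up node names; the real structures are the `Node` and `GraphBuilder` classes in this file.

```python
# Toy restatement of the raise-to-except bookkeeping added above.
class TinyBuilder(object):

  def __init__(self):
    self.leaves = set()  # nodes whose control flow continues into the next node
    self.raises = {}     # except-handler id -> raise nodes it may catch

  def connect_raise_node(self, node, except_guards):
    for guard in except_guards:
      self.raises.setdefault(guard, []).append(node)

  def enter_except_section(self, section_id):
    # Recorded raise nodes become predecessors of the handler body.
    self.leaves.update(self.raises.get(section_id, []))


builder = TinyBuilder()
builder.connect_raise_node('raise b', except_guards=('except a:', 'except b:'))
builder.enter_except_section('except a:')
print(builder.leaves)  # {'raise b'}
```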
@@ -711,7 +751,7 @@ class AstToCfg(gast.NodeVisitor): self.builder = self.builder_stack.pop() def visit_Return(self, node): - self._process_exit_statement(node, gast.FunctionDef) + self._process_exit_statement(node, (gast.FunctionDef,)) def visit_Expr(self, node): self._process_basic_statement(node) @@ -738,7 +778,8 @@ class AstToCfg(gast.NodeVisitor): self._process_basic_statement(node) def visit_Raise(self, node): - self._process_exit_statement(node, gast.FunctionDef) + self._process_exit_statement( + node, (gast.FunctionDef,), may_exit_via_except=True) self.builder.errors.add(node) def visit_Assert(self, node): @@ -818,13 +859,14 @@ class AstToCfg(gast.NodeVisitor): self.builder.end_statement(node) def visit_Break(self, node): - self._process_exit_statement(node, gast.While, gast.For) + self._process_exit_statement(node, (gast.While, gast.For,)) def visit_Continue(self, node): - self._process_continue_statement(node, gast.While, gast.For) + self._process_continue_statement(node, (gast.While, gast.For,)) def visit_ExceptHandler(self, node): self.builder.begin_statement(node) + self.builder.enter_except_section(node) if node.type is not None: self.visit(node.type) diff --git a/tensorflow/python/autograph/pyct/cfg_test.py b/tensorflow/python/autograph/pyct/cfg_test.py index 06fa0732455..7eee2504cf3 100644 --- a/tensorflow/python/autograph/pyct/cfg_test.py +++ b/tensorflow/python/autograph/pyct/cfg_test.py @@ -1309,21 +1309,125 @@ class AstToCfgTest(test.TestCase): graph, ( ('a, b', '(a > 0)', ('raise b', 'return 0')), - ('(a > 0)', 'raise b', None), + ('(a > 0)', 'raise b', 'return 1'), ('(a > 0)', 'return 0', None), - (None, 'return 1', None), + ('raise b', 'return 1', None), ), ) self.assertStatementEdges( graph, ( ('a, b', 'Try:2', None), - ('a, b', 'If:3', None), - (None, 'ExceptHandler:7', None), + ('a, b', 'If:3', 'return 1'), + ('raise b', 'ExceptHandler:7', None), ), ) self.assertGraphEnds(graph, 'a, b', ('return 0', 'return 1', 'raise b')) + def test_raise_exits(self): + + def test_fn(a, b): + raise b + return a # pylint:disable=unreachable + + graph, = self._build_cfg(test_fn).values() + + self.assertGraphMatches( + graph, + ( + ('a, b', 'raise b', None), + (None, 'return a', None), + ), + ) + self.assertGraphEnds(graph, 'a, b', ('raise b', 'return a')) + + def test_raise_triggers_enclosing_finally(self): + + def test_fn(a): + try: + try: + raise a + return 1 # pylint:disable=unreachable + finally: + b = 1 + return 2 + finally: + b = 2 + return b + + graph, = self._build_cfg(test_fn).values() + + self.assertGraphMatches( + graph, + ( + ('a', 'raise a', 'b = 1'), + (('raise a', 'return 1'), 'b = 1', 'b = 2'), + (None, 'return 1', 'b = 1'), + (None, 'return 2', 'b = 2'), + (('return 2', 'b = 1'), 'b = 2', None), + (None, 'return b', None), + ), + ) + self.assertGraphEnds( + graph, 'a', ('return b', 'b = 2')) + + def test_raise_adds_finally_sortcuts(self): + + def test_fn(a): + try: + try: + if a > 0: + raise a + c = 1 + finally: + b = 1 + c = 2 + finally: + b = 2 + return b, c + + graph, = self._build_cfg(test_fn).values() + + self.assertGraphMatches( + graph, + ( + ('a', '(a > 0)', ('raise a', 'c = 1')), + ('(a > 0)', 'raise a', 'b = 1'), + ('(a > 0)', 'c = 1', 'b = 1'), + (('raise a', 'c = 1'), 'b = 1', ('c = 2', 'b = 2')), + ('b = 1', 'c = 2', 'b = 2'), + (('b = 1', 'c = 2'), 'b = 2', 'return (b, c)'), + ('b = 2', 'return (b, c)', None), + ), + ) + self.assertGraphEnds( + graph, 'a', ('return (b, c)', 'b = 2')) + + def test_raise_exits_via_except(self): + + def test_fn(a, b): + 
try: + raise b + except a: + c = 1 + except b: + c = 2 + finally: + c += 3 + + graph, = self._build_cfg(test_fn).values() + + self.assertGraphMatches( + graph, + ( + ('a, b', 'raise b', ('c = 1', 'c = 2', 'c += 3')), + ('raise b', 'c = 1', 'c += 3'), + ('raise b', 'c = 2', 'c += 3'), + (('raise b', 'c = 1', 'c = 2'), 'c += 3', None), + ), + ) + self.assertGraphEnds(graph, 'a, b', ('c += 3',)) + def test_list_comprehension(self): def test_fn(a): From 1519ef5c6a92b0c397b3c95e3646f1d8e0b6a678 Mon Sep 17 00:00:00 2001 From: Youlong Cheng Date: Tue, 18 Feb 2020 15:34:25 -0800 Subject: [PATCH 187/442] Refactor xla_sharding to be more useful. PiperOrigin-RevId: 295838039 Change-Id: Ia138c41a9e2739379ecf3e2222686a195b0fe56d --- tensorflow/compiler/tf2xla/sharding_util.cc | 31 ++++++++++----------- tensorflow/compiler/tf2xla/sharding_util.h | 4 +++ 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/tensorflow/compiler/tf2xla/sharding_util.cc b/tensorflow/compiler/tf2xla/sharding_util.cc index 4d5bf0835e1..366e8d49228 100644 --- a/tensorflow/compiler/tf2xla/sharding_util.cc +++ b/tensorflow/compiler/tf2xla/sharding_util.cc @@ -26,22 +26,6 @@ const char kShardingAttribute[] = "_XlaSharding"; } // namespace namespace { -xla::StatusOr> GetShardingFromNodeDef( - const NodeDef& node_def) { - if (!HasNodeAttr(node_def, kShardingAttribute)) { - return absl::optional(); - } - string value; - xla::OpSharding sharding; - TF_RETURN_IF_ERROR(GetNodeAttr(node_def, kShardingAttribute, &value)); - if (!sharding.ParseFromString(value)) { - return xla::InvalidArgument( - "Experimental _XlaSharding attribute was not a valid encoded " - "xla::OpSharding proto."); - } - return absl::optional(sharding); -} - Status CoreOutOfRangeError(int core, int num_cores_per_replica) { return errors::InvalidArgument( "Invalid replicated core id: ", core, @@ -107,4 +91,19 @@ void SetShardingDeviceAssignmentFromNode(const Node& src, Node* dst) { } } +xla::StatusOr> GetShardingFromNodeDef( + const NodeDef& node_def) { + if (!HasNodeAttr(node_def, kShardingAttribute)) { + return absl::optional(); + } + string value; + xla::OpSharding sharding; + TF_RETURN_IF_ERROR(GetNodeAttr(node_def, kShardingAttribute, &value)); + if (!sharding.ParseFromString(value)) { + return xla::InvalidArgument( + "Experimental _XlaSharding attribute was not a valid encoded " + "xla::OpSharding proto."); + } + return absl::optional(sharding); +} } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/sharding_util.h b/tensorflow/compiler/tf2xla/sharding_util.h index ab67d4f1542..196434826f9 100644 --- a/tensorflow/compiler/tf2xla/sharding_util.h +++ b/tensorflow/compiler/tf2xla/sharding_util.h @@ -45,6 +45,10 @@ xla::StatusOr> ParseShardingFromDevice( void SetShardingDeviceAssignmentFromNode(const Node& src, Node* dst); +// Get sharding inforamtion from node. +xla::StatusOr> GetShardingFromNodeDef( + const NodeDef& node_def); + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_TF2XLA_SHARDING_UTIL_H_ From ff30c17039f2b77e806ed9fb19a78e9dcd7cf4ed Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Tue, 18 Feb 2020 15:38:07 -0800 Subject: [PATCH 188/442] Add sparse_out support for CategoryEncoding layer. 
PiperOrigin-RevId: 295838860 Change-Id: I33d2ecf132bc3ff2620a292a5f8725212f9343c2 --- .../preprocessing/categorical_encoding.py | 51 ++++++- .../categorical_encoding_test.py | 134 +++++++++++++++++- 2 files changed, 182 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/keras/layers/preprocessing/categorical_encoding.py b/tensorflow/python/keras/layers/preprocessing/categorical_encoding.py index e61b3cb6b65..0bd011646f8 100644 --- a/tensorflow/python/keras/layers/preprocessing/categorical_encoding.py +++ b/tensorflow/python/keras/layers/preprocessing/categorical_encoding.py @@ -35,6 +35,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import sparse_ops +from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.util import compat TFIDF = "tf-idf" @@ -68,10 +69,16 @@ class CategoricalEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): of times the token at that index appeared in the batch item. "tf-idf": As "binary", but the TF-IDF algorithm is applied to find the value in each token slot. + sparse: Boolean. If true, returns a `SparseTensor` instead of a dense + `Tensor`. Defaults to `False`. """ # TODO(momernick): Add an examples section to the docstring. - def __init__(self, max_tokens=None, output_mode=COUNT, **kwargs): + def __init__(self, + max_tokens=None, + output_mode=COUNT, + sparse=False, + **kwargs): # 'output_mode' must be one of (COUNT, BINARY, TFIDF) layer_utils.validate_string_arg( output_mode, @@ -92,6 +99,7 @@ class CategoricalEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): self._max_tokens = max_tokens self._output_mode = output_mode + self._sparse = sparse self._called = False # This layer supports RaggedTensor inputs. @@ -130,7 +138,11 @@ class CategoricalEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): def compute_output_signature(self, input_spec): output_shape = self.compute_output_shape(input_spec.shape.as_list()) output_dtype = K.floatx() if self._output_mode == TFIDF else dtypes.int64 - return tensor_spec.TensorSpec(shape=output_shape, dtype=output_dtype) + if self._sparse: + return sparse_tensor.SparseTensorSpec( + shape=output_shape, dtype=output_dtype) + else: + return tensor_spec.TensorSpec(shape=output_shape, dtype=output_dtype) def adapt(self, data, reset_state=True): """Fits the state of the preprocessing layer to the dataset. 
@@ -169,6 +181,7 @@ class CategoricalEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): config = { "max_tokens": self._max_tokens, "output_mode": self._output_mode, + "sparse": self._sparse, } base_config = super(CategoricalEncoding, self).get_config() return dict(list(base_config.items()) + list(config.items())) @@ -179,6 +192,18 @@ class CategoricalEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): else: return np.array(x) + def _convert_to_sparse_inputs(self, inputs): + if isinstance(inputs, sparse_tensor.SparseTensor): + return inputs + elif isinstance(inputs, ragged_tensor.RaggedTensor): + return inputs.to_sparse() + else: + indices = array_ops.where_v2( + math_ops.greater_equal(inputs, array_ops.constant(0, inputs.dtype))) + values = array_ops.gather_nd(inputs, indices) + shape = array_ops.shape(inputs, out_type=dtypes.int64) + return sparse_tensor.SparseTensor(indices, values, shape) + def set_num_elements(self, num_elements): if self._max_tokens is not None: raise RuntimeError( @@ -215,6 +240,28 @@ class CategoricalEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): else: out_depth = self._max_tokens + if self._sparse: + if self._output_mode != COUNT: + raise ValueError("Only supports `sparse=True` when `output_mode` " + ' is \"count\", got {}'.format(self._output_mode)) + inputs = self._convert_to_sparse_inputs(inputs) + + # Consider having sparse.one_hot + # Append values to indices, and reduce sum to get the counts. + tokens = array_ops.expand_dims( + math_ops.cast(inputs.values, dtypes.int64), axis=1) + count_tokens = array_ops.concat([inputs.indices, tokens], axis=1) + count_values = array_ops.ones_like(inputs.values, dtype=dtypes.int64) + unreduced_count_shape = array_ops.concat( + [inputs.dense_shape, [out_depth]], axis=0) + counts = sparse_tensor.SparseTensor( + indices=count_tokens, + values=count_values, + dense_shape=unreduced_count_shape) + count_data = sparse_ops.sparse_reduce_sum_v2( + counts, axis=1, output_is_sparse=True) + return count_data + # If the input is a sparse tensor, we densify it with the default value of # -1. Because -1 is ignored by one_hot, this effectively drops the non-set # positions from the output encoding. 
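In effect, the sparse "count" branch above is a per-row bincount written with sparse ops: every (batch, token) pair contributes a value of one, and the sparse reduce-sum over the middle axis collapses repeated tokens into counts. The NumPy restatement below is shown only to make the expected output concrete; it is not part of the layer's API.

```python
# NumPy restatement of the sparse "count" encoding: per-row token counts.
import numpy as np


def count_encode(batch, depth):
  out = np.zeros((len(batch), depth), dtype=np.int64)
  for row, tokens in enumerate(batch):
    for token in tokens:
      if token >= 0:  # negative entries mark missing/padded values
        out[row, token] += 1
  return out


print(count_encode([[1, 2, 3], [3, 3, 0]], depth=6))
# [[0 1 1 1 0 0]
#  [1 0 0 2 0 0]]
```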
diff --git a/tensorflow/python/keras/layers/preprocessing/categorical_encoding_test.py b/tensorflow/python/keras/layers/preprocessing/categorical_encoding_test.py index 7608f8715b5..e21e95a0078 100644 --- a/tensorflow/python/keras/layers/preprocessing/categorical_encoding_test.py +++ b/tensorflow/python/keras/layers/preprocessing/categorical_encoding_test.py @@ -26,14 +26,18 @@ from tensorflow.python import keras from tensorflow.python.data.ops import dataset_ops from tensorflow.python.eager import context +from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import sparse_tensor from tensorflow.python.keras import backend from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras.layers import core from tensorflow.python.keras.layers.preprocessing import categorical_encoding from tensorflow.python.keras.layers.preprocessing import categorical_encoding_v1 from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils +from tensorflow.python.ops import math_ops from tensorflow.python.ops import sparse_ops +from tensorflow.python.ops import variables from tensorflow.python.ops.ragged import ragged_factory_ops from tensorflow.python.platform import test @@ -45,11 +49,46 @@ def get_layer_class(): return categorical_encoding_v1.CategoricalEncoding -@keras_parameterized.run_all_keras_modes +@keras_parameterized.run_all_keras_modes(always_skip_v1=True) class CategoricalEncodingInputTest( keras_parameterized.TestCase, preprocessing_test_utils.PreprocessingLayerTest): + def test_dense_input_sparse_output(self): + input_array = constant_op.constant([[1, 2, 3], [3, 3, 0]]) + + # The expected output should be (X for missing value): + # [[X, 1, 1, 1] + # [1, X, X, X] + # [X, X, X, 2]] + expected_indices = [[0, 1], [0, 2], [0, 3], [1, 0], [1, 3]] + expected_values = [1, 1, 1, 1, 2] + max_tokens = 6 + + input_data = keras.Input(shape=(None,), dtype=dtypes.int32) + layer = get_layer_class()( + max_tokens=max_tokens, + output_mode=categorical_encoding.COUNT, + sparse=True) + int_data = layer(input_data) + + model = keras.Model(inputs=input_data, outputs=int_data) + sp_output_dataset = model.predict(input_array, steps=1) + self.assertAllEqual(expected_values, sp_output_dataset.values) + self.assertAllEqual(expected_indices, sp_output_dataset.indices) + + # Assert sparse output is same as dense output. 
+ layer = get_layer_class()( + max_tokens=max_tokens, + output_mode=categorical_encoding.COUNT, + sparse=False) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array, steps=1) + self.assertAllEqual( + sparse_ops.sparse_tensor_to_dense(sp_output_dataset, default_value=0), + output_dataset) + def test_sparse_input(self): input_array = np.array([[1, 2, 3, 0], [0, 3, 1, 0]], dtype=np.int64) sparse_tensor_data = sparse_ops.from_dense(input_array) @@ -72,6 +111,45 @@ class CategoricalEncodingInputTest( output_dataset = model.predict(sparse_tensor_data, steps=1) self.assertAllEqual(expected_output, output_dataset) + def test_sparse_input_sparse_output(self): + sp_inp = sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 1], [2, 0], [2, 1], [3, 1]], + values=[0, 2, 1, 1, 0], + dense_shape=[4, 2]) + input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) + + # The expected output should be (X for missing value): + # [[1, X, X, X] + # [X, X, 1, X] + # [X, 2, X, X] + # [1, X, X, X]] + expected_indices = [[0, 0], [1, 2], [2, 1], [3, 0]] + expected_values = [1, 1, 2, 1] + max_tokens = 6 + + layer = get_layer_class()( + max_tokens=max_tokens, + output_mode=categorical_encoding.COUNT, + sparse=True) + int_data = layer(input_data) + + model = keras.Model(inputs=input_data, outputs=int_data) + sp_output_dataset = model.predict(sp_inp, steps=1) + self.assertAllEqual(expected_values, sp_output_dataset.values) + self.assertAllEqual(expected_indices, sp_output_dataset.indices) + + # Assert sparse output is same as dense output. + layer = get_layer_class()( + max_tokens=max_tokens, + output_mode=categorical_encoding.COUNT, + sparse=False) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(sp_inp, steps=1) + self.assertAllEqual( + sparse_ops.sparse_tensor_to_dense(sp_output_dataset, default_value=0), + output_dataset) + def test_ragged_input(self): input_array = ragged_factory_ops.constant([[1, 2, 3], [3, 1]]) @@ -94,6 +172,60 @@ class CategoricalEncodingInputTest( output_dataset = model.predict(input_array, steps=1) self.assertAllEqual(expected_output, output_dataset) + def test_ragged_input_sparse_output(self): + input_array = ragged_factory_ops.constant([[1, 2, 3], [3, 3]]) + + # The expected output should be (X for missing value): + # [[X, 1, 1, 1] + # [X, X, X, 2]] + expected_indices = [[0, 1], [0, 2], [0, 3], [1, 3]] + expected_values = [1, 1, 1, 2] + max_tokens = 6 + + input_data = keras.Input(shape=(None,), dtype=dtypes.int32, ragged=True) + layer = get_layer_class()( + max_tokens=max_tokens, + output_mode=categorical_encoding.COUNT, + sparse=True) + int_data = layer(input_data) + + model = keras.Model(inputs=input_data, outputs=int_data) + sp_output_dataset = model.predict(input_array, steps=1) + self.assertAllEqual(expected_values, sp_output_dataset.values) + self.assertAllEqual(expected_indices, sp_output_dataset.indices) + + # Assert sparse output is same as dense output. + layer = get_layer_class()( + max_tokens=max_tokens, + output_mode=categorical_encoding.COUNT, + sparse=False) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array, steps=1) + self.assertAllEqual( + sparse_ops.sparse_tensor_to_dense(sp_output_dataset, default_value=0), + output_dataset) + + # Keras functional model doesn't support dense layer stacked with sparse out. 
+ def DISABLED_test_sparse_output_and_dense_layer(self): + input_array = constant_op.constant([[1, 2, 3], [3, 3, 0]]) + + max_tokens = 4 + + input_data = keras.Input(shape=(None,), dtype=dtypes.int32) + encoding_layer = get_layer_class()( + max_tokens=max_tokens, + output_mode=categorical_encoding.COUNT, + sparse=True) + int_data = encoding_layer(input_data) + output_data = math_ops.cast(int_data, dtypes.float32) + weights = variables.Variable([[.1], [.2], [.3], [.4]], dtype=dtypes.float32) + weights_mult = lambda x: sparse_ops.sparse_tensor_dense_matmul(x, weights) + output_data = keras.layers.Lambda(weights_mult)(output_data) + + model = keras.Model(inputs=input_data, outputs=output_data) + _ = model.predict(input_array, steps=1) + @keras_parameterized.run_all_keras_modes class CategoricalEncodingAdaptTest( From 959c75a200fc2a26568563c03a366102d83e30da Mon Sep 17 00:00:00 2001 From: Sean Silva Date: Tue, 18 Feb 2020 16:08:24 -0800 Subject: [PATCH 189/442] Fix tf-shape-inference when return op is in a different block. PiperOrigin-RevId: 295845982 Change-Id: I31a531f3ae1b7540fb5c92d41212fd5131255542 --- .../tensorflow/tests/shape_inference.mlir | 11 +++++++++++ .../tensorflow/transforms/shape_inference.cc | 19 +++++++++++++------ 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir index 23cc06de453..c9db7e0a1dc 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir @@ -45,6 +45,17 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr return %1 : tensor<*xf32> } +// CHECK-LABEL: func @multiple_blocks_one_return(%arg0: tensor) -> tensor +func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { + br ^bb1 +^bb1: +// CHECK: %[[IDENTITY:.*]] = "tf.Identity"(%arg0) : (tensor) -> tensor +// CHECK: return %[[IDENTITY]] : tensor + %ret = "tf.Identity"(%arg0) : (tensor) -> tensor<*xf32> + return %ret : tensor<*xf32> +} + + // Tests the case where an inference opportunity relies on folding. // CHECK-LABEL: func @simple_folding diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index fd485d17374..c44f0f97fd6 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -60,16 +60,23 @@ namespace TF { namespace { Optional> InferShapeForFunctionReturnType( FuncOp func) { - // Only infer shape when there is one return op for now. - if (!has_single_element(func.getBody()) || func.front().empty()) { + // Find any return ops. + SmallVector return_ops; + for (Block& block : func) { + if (auto return_op = dyn_cast(block.getTerminator())) { + return_ops.push_back(return_op); + } + } + + // Right now we only handle the case of a single return op. + // To handle multiple return ops, we would need to look at all their shapes + // and come up with a common shape and insert appropriate casts. + if (return_ops.size() != 1) { return None; } // Find the return type. - auto return_op = dyn_cast(func.front().back()); - if (!return_op) { - return None; - } + auto return_op = return_ops.front(); // Manually fold tf.Cast that precedes the return instruction and only differs // in shape refinement level. 
From 634829348f50d661c923a16ba50be83d37530a87 Mon Sep 17 00:00:00 2001 From: Ken Franko Date: Tue, 18 Feb 2020 16:09:59 -0800 Subject: [PATCH 190/442] Make devices property on DistributedValue private. PiperOrigin-RevId: 295846322 Change-Id: I3c3da742aef1beb547a2aba98ff12ed26d275487 --- .../python/distribute/cross_device_ops.py | 10 ++-- .../distribute/cross_device_ops_test.py | 6 +- .../python/distribute/mirrored_strategy.py | 2 +- .../distribute/mirrored_strategy_test.py | 12 ++-- .../distribute/mirrored_variable_test.py | 58 +++++++++++++------ .../distribute/parameter_server_strategy.py | 2 +- tensorflow/python/distribute/tpu_strategy.py | 2 +- tensorflow/python/distribute/values.py | 6 +- 8 files changed, 59 insertions(+), 39 deletions(-) diff --git a/tensorflow/python/distribute/cross_device_ops.py b/tensorflow/python/distribute/cross_device_ops.py index 9d44f5c554c..7f6230e9404 100644 --- a/tensorflow/python/distribute/cross_device_ops.py +++ b/tensorflow/python/distribute/cross_device_ops.py @@ -154,7 +154,7 @@ def _validate_value_destination_pairs(value_destination_pairs): # CrossDeviceOps. def get_devices_from(destinations): if isinstance(destinations, value_lib.DistributedValues): - return destinations.devices + return destinations._devices # pylint: disable=protected-access elif isinstance(destinations, six.string_types): return (device_util.resolve(destinations),) return (device_util.resolve(destinations.device),) @@ -441,12 +441,12 @@ def _group_value_by_device(per_replica_values): a list of lists, each sublist has components for its corresponding device of PerReplica objects, paired with a None. """ - destinations = per_replica_values[0].devices + destinations = per_replica_values[0]._devices # pylint: disable=protected-access grouped = [[] for _ in range(len(destinations))] for per_replica_value in per_replica_values: # pylint: disable=protected-access for i, v in enumerate(per_replica_value.values): - assert per_replica_value.devices == destinations + assert per_replica_value._devices == destinations grouped[i].append((v, None)) return grouped @@ -730,7 +730,7 @@ class AllReduceCrossDeviceOps(CrossDeviceOps): (len(dense_values), self._all_reduce_alg, self._num_packs, self._agg_small_grads_max_bytes, self._agg_small_grads_max_group), 10) - destinations = dense_values[0].devices + destinations = dense_values[0]._devices # pylint: disable=protected-access grouped = _group_value_by_device(dense_values) device_grad_packs, tensor_packer = _pack_tensors( @@ -1010,7 +1010,7 @@ class CollectiveAllReduce(CrossDeviceOps): devices = get_devices_from(destinations) if (isinstance(all_reduced, value_lib.Mirrored) and - (all_reduced.devices == devices)): + (all_reduced._devices == devices)): # pylint: disable=protected-access return all_reduced # Convert `all_reduced` to a `Mirrored` object, as a simple and uniform diff --git a/tensorflow/python/distribute/cross_device_ops_test.py b/tensorflow/python/distribute/cross_device_ops_test.py index d1fdaf1c9eb..17be5de236e 100644 --- a/tensorflow/python/distribute/cross_device_ops_test.py +++ b/tensorflow/python/distribute/cross_device_ops_test.py @@ -44,7 +44,7 @@ def _get_devices(devices): if isinstance(devices, (tuple, list)): return tuple(device_util.resolve(d) for d in devices) elif isinstance(devices, value_lib.DistributedValues): - return devices.devices + return devices._devices elif isinstance(devices, ops.Tensor): return (device_util.resolve(devices.device),) return (device_util.resolve(devices),) @@ -124,7 +124,7 @@ class 
CrossDeviceOpsTestBase(test.TestCase, parameterized.TestCase): self._assert_values_equal(l, r) else: if isinstance(left, value_lib.DistributedValues): - self.assertEqual(set(left.devices), set(right.devices)) + self.assertEqual(set(left._devices), set(right._devices)) self._assert_values_equal(left.values, right.values) else: self.assertEqual( @@ -512,7 +512,7 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, self._assert_values_equal(l, r, sess) else: if isinstance(left, value_lib.DistributedValues): - self.assertEqual(set(left.devices), set(right.devices)) + self.assertEqual(set(left._devices), set(right._devices)) self._assert_values_equal(left.values, right.values, sess) else: self.assertEqual( diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py index 20b1274f81f..630ae85ff97 100644 --- a/tensorflow/python/distribute/mirrored_strategy.py +++ b/tensorflow/python/distribute/mirrored_strategy.py @@ -578,7 +578,7 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): with ops.device(colocate_with.device): return next_creator(**kwargs) else: - devices = colocate_with.devices + devices = colocate_with._devices # pylint: disable=protected-access def _real_mirrored_creator(**kwargs): # pylint: disable=g-missing-docstring value_list = [] diff --git a/tensorflow/python/distribute/mirrored_strategy_test.py b/tensorflow/python/distribute/mirrored_strategy_test.py index fa7e4a8fcd4..f1f693d30dc 100644 --- a/tensorflow/python/distribute/mirrored_strategy_test.py +++ b/tensorflow/python/distribute/mirrored_strategy_test.py @@ -714,18 +714,18 @@ class MirroredVariableUpdateTest(test.TestCase): self.assertEqual(7.0, self.evaluate(mirrored_var.values[0])) self.assertEqual(7.0, self.evaluate(mirrored_var.values[1])) self.assertEqual( - distribution.extended.worker_devices[0], mirrored_var.devices[0]) + distribution.extended.worker_devices[0], mirrored_var._devices[0]) self.assertEqual( - distribution.extended.worker_devices[1], mirrored_var.devices[1]) + distribution.extended.worker_devices[1], mirrored_var._devices[1]) # read_value == False self.evaluate(mirrored_var.assign_add(2.0, read_value=False)) self.assertEqual(9.0, self.evaluate(mirrored_var.values[0])) self.assertEqual(9.0, self.evaluate(mirrored_var.values[1])) self.assertEqual( - distribution.extended.worker_devices[0], mirrored_var.devices[0]) + distribution.extended.worker_devices[0], mirrored_var._devices[0]) self.assertEqual( - distribution.extended.worker_devices[1], mirrored_var.devices[1]) + distribution.extended.worker_devices[1], mirrored_var._devices[1]) def testAssignAddMirroredVarReplicaContext(self, distribution): def var_fn(): @@ -780,9 +780,9 @@ class MirroredVariableUpdateTest(test.TestCase): self.assertEqual(3.0, self.evaluate(mirrored_var.values[0])) self.assertEqual(3.0, self.evaluate(mirrored_var.values[1])) self.assertEqual( - distribution.extended.worker_devices[0], mirrored_var.devices[0]) + distribution.extended.worker_devices[0], mirrored_var._devices[0]) self.assertEqual( - distribution.extended.worker_devices[1], mirrored_var.devices[1]) + distribution.extended.worker_devices[1], mirrored_var._devices[1]) def testAssignSubMirroredVarReplicaContext(self, distribution): def var_fn(): diff --git a/tensorflow/python/distribute/mirrored_variable_test.py b/tensorflow/python/distribute/mirrored_variable_test.py index 37db3c4d4a0..f6ec7ccdc8d 100644 --- a/tensorflow/python/distribute/mirrored_variable_test.py +++ 
b/tensorflow/python/distribute/mirrored_variable_test.py @@ -94,9 +94,11 @@ class MirroredVariableCreationTest(test.TestCase): self.assertIsInstance(var, values.MirroredVariable) self.assertEqual(name, var.name) self.assertIs(strategy, var.distribute_strategy) - for i, d in enumerate(var.devices): - self.assertEqual(d, var.values[i].device) - self.assertIs(strategy, var.values[i]._distribute_strategy) # pylint: disable=protected-access + for i, d in enumerate(var._devices): + self.assertEqual(d, strategy.experimental_local_results(var)[i].device) + self.assertIs( + strategy, + strategy.experimental_local_results(var)[i]._distribute_strategy) # pylint: disable=protected-access def testVariableInFuncGraph(self, distribution): @@ -234,9 +236,9 @@ class MirroredVariableCreationTest(test.TestCase): model_fn, args=(features,)) for kernel, bias in result: self.assertIsInstance(kernel, values.MirroredVariable) - self.assertAllDifferent(kernel.values) + self.assertAllDifferent(distribution.experimental_local_results(kernel)) self.assertIsInstance(bias, values.MirroredVariable) - self.assertAllDifferent(kernel.values) + self.assertAllDifferent(distribution.experimental_local_results(kernel)) def testWithVariableAndVariableScope(self, distribution): @@ -335,12 +337,16 @@ class MirroredVariableCreationTest(test.TestCase): with distribution.scope(): v0, v1 = distribution.extended.call_for_each_replica(create_fn) self.evaluate(v0.initializer) - self.assertEqual(2.0, self.evaluate(v0.values[0])) - self.assertEqual(2.0, self.evaluate(v0.values[1])) + self.assertEqual( + 2.0, self.evaluate(distribution.experimental_local_results(v0)[0])) + self.assertEqual( + 2.0, self.evaluate(distribution.experimental_local_results(v0)[1])) self.assertEqual(2.0, self.evaluate(distribution.extended.read_var(v0))) self.evaluate(v1.initializer) - self.assertEqual(3.0, self.evaluate(v1.values[0])) - self.assertEqual(3.0, self.evaluate(v1.values[1])) + self.assertEqual( + 3.0, self.evaluate(distribution.experimental_local_results(v1)[0])) + self.assertEqual( + 3.0, self.evaluate(distribution.experimental_local_results(v1)[1])) self.assertEqual(3.0, self.evaluate(distribution.extended.read_var(v1))) def replica_id_plus_one(): @@ -357,20 +363,23 @@ class MirroredVariableCreationTest(test.TestCase): # Update "sync on read" variable. self.evaluate(distribution.group(update0a)) - self.assertEqual(2.0 + 5.0, self.evaluate(v0.values[0])) + local_results = self.evaluate(distribution.experimental_local_results(v0)) + self.assertEqual(2.0 + 5.0, local_results[0]) # Writes are not synchronized for "sync on read" variables, # so device[1] can end up with a different value. - self.assertEqual(2.0 + 2 * 5.0, self.evaluate(v0.values[1])) + self.assertEqual(2.0 + 2 * 5.0, local_results[1]) # Always reads from device 0. self.assertEqual(2.0 + 5.0, self.evaluate(distribution.extended.read_var(v0))) # Update "sync on write" variable. self.evaluate(distribution.group(update1a)) - self.assertEqual(3.0 + 7.0, self.evaluate(v1.values[0])) + local_results1 = self.evaluate( + distribution.experimental_local_results(v1)) + self.assertEqual(3.0 + 7.0, local_results1[0]) # Writes are synchronized for v1, only the argument to assign_add on # device[0] is used. 
- self.assertEqual(3.0 + 7.0, self.evaluate(v1.values[1])) + self.assertEqual(3.0 + 7.0, local_results1[1]) self.assertEqual(3.0 + 7.0, self.evaluate(distribution.extended.read_var(v1))) @@ -385,15 +394,18 @@ class MirroredVariableCreationTest(test.TestCase): self.evaluate(distribution.group(update0b)) # Update "sync on read" variable. - self.assertEqual(2.0 + 5.0 + 11.0, self.evaluate(v0.values[0])) - self.assertEqual(2.0 + 2 * 5.0 + 2 * 11.0, self.evaluate(v0.values[1])) + local_results = self.evaluate(distribution.experimental_local_results(v0)) + self.assertEqual(2.0 + 5.0 + 11.0, local_results[0]) + self.assertEqual(2.0 + 2 * 5.0 + 2 * 11.0, local_results[1]) self.assertEqual(2.0 + 5.0 + 11.0, self.evaluate(distribution.extended.read_var(v0))) # Update "sync on write" variable. self.evaluate(distribution.group(update1b)) - self.assertEqual(3.0 + 7.0 + 13.0, self.evaluate(v1.values[0])) - self.assertEqual(3.0 + 7.0 + 13.0, self.evaluate(v1.values[1])) + local_results1 = self.evaluate( + distribution.experimental_local_results(v1)) + self.assertEqual(3.0 + 7.0 + 13.0, local_results1[0]) + self.assertEqual(3.0 + 7.0 + 13.0, local_results1[1]) self.assertEqual(3.0 + 7.0 + 13.0, self.evaluate(distribution.extended.read_var(v1))) @@ -584,7 +596,11 @@ class MirroredVariableCreationTest(test.TestCase): self.evaluate(variables.global_variables_initializer()) # Assert that the aggregated value of the sync on read var is the sum # of the individual values before running the update ops. - self.assertEqual(1.0, self.evaluate(ret_v_sum.values[0].read_value())) + self.assertEqual( + 1.0, + self.evaluate( + distribution.experimental_local_results(ret_v_sum) + [0].read_value())) self.assertEqual(2.0, self.evaluate(ret_v_sum)) # Apply updates. @@ -593,7 +609,11 @@ class MirroredVariableCreationTest(test.TestCase): self.evaluate(update_ops) # Assert that the aggregated value of the sync on read vars is the sum # of the individual values after running the update ops. 
- self.assertEqual(5.0, self.evaluate(ret_v_sum.values[0].read_value())) + self.assertEqual( + 5.0, + self.evaluate( + distribution.experimental_local_results(ret_v_sum) + [0].read_value())) self.assertEqual(10.0, self.evaluate(ret_v_sum)) def testVarDistributeStrategy(self, distribution): diff --git a/tensorflow/python/distribute/parameter_server_strategy.py b/tensorflow/python/distribute/parameter_server_strategy.py index a807d4ae9ff..d27bacf6be7 100644 --- a/tensorflow/python/distribute/parameter_server_strategy.py +++ b/tensorflow/python/distribute/parameter_server_strategy.py @@ -486,7 +486,7 @@ class ParameterServerStrategyExtended(distribute_lib.StrategyExtendedV1): def _select_fn(x): # pylint: disable=g-missing-docstring if isinstance(x, values.Mirrored): - if len(x.devices) == 1: + if len(x._devices) == 1: # pylint: disable=protected-access return x._primary # pylint: disable=protected-access else: raise ValueError( diff --git a/tensorflow/python/distribute/tpu_strategy.py b/tensorflow/python/distribute/tpu_strategy.py index e0a25ba84ea..54e2028ccaf 100644 --- a/tensorflow/python/distribute/tpu_strategy.py +++ b/tensorflow/python/distribute/tpu_strategy.py @@ -621,7 +621,7 @@ class TPUExtended(distribute_lib.StrategyExtendedV1): with ops.device(colocate_with.device): return next_creator(**kwargs) else: - devices = colocate_with.devices + devices = colocate_with._devices # pylint: disable=protected-access def _real_mirrored_creator(**kwargs): # pylint: disable=g-missing-docstring initial_value = None diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py index fb3e2ffd817..6210d51124b 100644 --- a/tensorflow/python/distribute/values.py +++ b/tensorflow/python/distribute/values.py @@ -98,7 +98,7 @@ class DistributedValues(object): return self._values @property - def devices(self): + def _devices(self): return tuple(v.device for v in self._values) def __str__(self): @@ -505,7 +505,7 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable): @property def op(self): # We want cross-replica code that does some var.op.X calls - # to work (even if the current device isn't in self.devices), but + # to work (even if the current device isn't in self._devices), but # other uses of var.op in a cross-replica context to fail. if distribution_strategy_context.in_cross_replica_context(): return DistributedVarOp(self._primary.op.name, self._primary.op.graph, @@ -1014,7 +1014,7 @@ class _SyncOnReadSaveable(saver.BaseSaverBuilder.SaveableObject): # when saving. tensor, = restored_tensors if self._sync_on_read_variable.aggregation == vs.VariableAggregation.SUM: - tensor = math_ops.cast(tensor / len(self._sync_on_read_variable.devices), + tensor = math_ops.cast(tensor / len(self._sync_on_read_variable._devices), # pylint: disable=protected-access self._sync_on_read_variable.dtype) return control_flow_ops.group( tuple( From 11cb6e2c24e9c7c7cb92b6bfd0994151015fa1c8 Mon Sep 17 00:00:00 2001 From: Srinivas Vasudevan Date: Tue, 18 Feb 2020 16:23:27 -0800 Subject: [PATCH 191/442] Add XLA Op registrations for IgammaGradA and RandomGammaGrad. - This allows gradients of the igamma function to work in XLA, along with reparameterized gamma samplers. 
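
For context only (not part of this change), a minimal sketch of how the new client-library entry point could be exercised directly from C++; aside from xla::IgammaGradA, which this patch adds to client/lib/math, everything below is the existing XlaBuilder API, and the wrapper function name is hypothetical:

    // Sketch: build a tiny XLA computation that evaluates d/da igamma(a, x)
    // through the IgammaGradA helper introduced in client/lib/math by this patch.
    #include "tensorflow/compiler/xla/client/lib/math.h"
    #include "tensorflow/compiler/xla/client/xla_builder.h"
    #include "tensorflow/compiler/xla/shape_util.h"
    #include "tensorflow/compiler/xla/statusor.h"

    xla::StatusOr<xla::XlaComputation> BuildIgammaGradAExample() {
      xla::XlaBuilder builder("igamma_grad_a_example");
      const xla::Shape scalar_f32 = xla::ShapeUtil::MakeShape(xla::F32, {});
      xla::XlaOp a = xla::Parameter(&builder, 0, scalar_f32, "a");
      xla::XlaOp x = xla::Parameter(&builder, 1, scalar_f32, "x");
      // Emits the series / continued-fraction expansion added to math.cc below.
      xla::IgammaGradA(a, x);
      return builder.Build();
    }

This is the same building block the new IgammaGradA tf2xla kernel relies on, and it is what makes the gradient of tf.math.igamma available when the surrounding graph is compiled with XLA.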
PiperOrigin-RevId: 295849184 Change-Id: I7fed512089ee843271211478b7b375ce4a77b5fb --- .../compiler/jit/mark_for_compilation_pass.cc | 2 + .../compiler/tests/special_math_test.py | 110 ++++++++- .../compiler/tf2xla/kernels/binary_ops.cc | 17 ++ tensorflow/compiler/tf2xla/python/xla.py | 3 + tensorflow/compiler/xla/client/lib/math.cc | 228 ++++++++++++++++-- tensorflow/compiler/xla/client/lib/math.h | 8 + tensorflow/compiler/xla/python/xla.cc | 2 + tensorflow/compiler/xla/python/xla_client.py | 1 + 8 files changed, 353 insertions(+), 18 deletions(-) diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 4bb1fde7a9b..b36fe6ae5e9 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -1883,6 +1883,8 @@ absl::flat_hash_set GetKnownXLAWhitelistOp() { "EmptyTensorList", "ExtractImagePatches", "Igamma", + "IgammaGradA", + "RandomGammaGrad", "Igammac", "FFT", "FFT2D", diff --git a/tensorflow/compiler/tests/special_math_test.py b/tensorflow/compiler/tests/special_math_test.py index 7beebf0720e..b3abc40f82d 100644 --- a/tensorflow/compiler/tests/special_math_test.py +++ b/tensorflow/compiler/tests/special_math_test.py @@ -29,6 +29,10 @@ import scipy.special as sps import six from tensorflow.compiler.tests import xla_test +from tensorflow.python.framework import constant_op +from tensorflow.python.ops import gen_math_ops +from tensorflow.python.ops import gen_random_ops +from tensorflow.python.ops import gradient_checker_v2 from tensorflow.python.ops import math_ops from tensorflow.python.platform import test @@ -39,6 +43,13 @@ flags.DEFINE_bool('vary_seed', False, NUM_SAMPLES = int(1e3) +# This is df/da / df/dx, where f = igamma. +def implicit_reparameterization_grad(a, x): + log_prob = math_ops.xlogy(a - 1., x) - math_ops.lgamma(a) - x + prob = math_ops.exp(log_prob) + return -gen_math_ops.igamma_grad_a(a, x) / prob + + class IgammaTest(xla_test.XLATestCase, parameterized.TestCase): def setUp(self): @@ -48,9 +59,15 @@ class IgammaTest(xla_test.XLATestCase, parameterized.TestCase): answer = int(entropy.encode('hex'), 16) else: answer = int.from_bytes(entropy, 'big') - np.random.seed(answer) + np.random.seed(answer % (2**32 - 1)) super(IgammaTest, self).setUp() + # Skip Float64 test on TPU due to missing ops. + def maybe_skip_test(self, dtype): + if self.device not in ['XLA_GPU', 'XLA_CPU', 'CPU'] and dtype == np.float64: + self.skipTest( + 'Skipping test because some F64 operations not supported on TPU.') + @parameterized.parameters((np.float32, 1e-2, 1e-11), (np.float64, 1e-4, 1e-30)) def testIgammaSmallValues(self, dtype, rtol, atol): @@ -93,6 +110,97 @@ class IgammaTest(xla_test.XLATestCase, parameterized.TestCase): actual = sess.run(math_ops.igamma(a, x)) self.assertAllClose(expected_values, actual, atol=atol, rtol=rtol) + # We don't check small values because the numerical gradients become quite + # large. 
+ @parameterized.parameters((np.float32, 0.09), (np.float64, 1e-7)) + def testIgammaGradMediumValues(self, dtype, tolerance): + self.maybe_skip_test(dtype) + with self.session(): + with self.test_scope(): + x = constant_op.constant( + np.random.uniform(low=1., high=100., + size=[NUM_SAMPLES]).astype(dtype)) + a = constant_op.constant( + np.random.uniform(low=1., high=100., + size=[NUM_SAMPLES]).astype(dtype)) + + f = lambda b: math_ops.igamma(b, x) + max_error = gradient_checker_v2.max_error( + *gradient_checker_v2.compute_gradient(f, x=[a], delta=1e-3)) + self.assertLessEqual(max_error, tolerance) + + @parameterized.parameters((np.float32, 0.5), (np.float64, 1e-7)) + def testIgammaGradLargeValues(self, dtype, tolerance): + self.maybe_skip_test(dtype) + with self.session(): + with self.test_scope(): + x = constant_op.constant( + np.random.uniform(low=100., high=int(1e4), + size=[NUM_SAMPLES]).astype(dtype)) + a = constant_op.constant( + np.random.uniform(low=100., high=int(1e4), + size=[NUM_SAMPLES]).astype(dtype)) + + f = lambda b: math_ops.igamma(b, x) + max_error = gradient_checker_v2.max_error( + *gradient_checker_v2.compute_gradient(f, x=[a], delta=1e-2)) + self.assertLessEqual(max_error, tolerance) + + @parameterized.parameters((np.float32, 1e-2, 1e-11), + (np.float64, 1e-4, 1e-30)) + def testRandomGammaGradSmallValues(self, dtype, rtol, atol): + self.maybe_skip_test(dtype) + # Test values near zero. + + with self.session() as sess: + with self.test_scope(): + x = constant_op.constant( + np.random.uniform( + low=np.finfo(dtype).tiny, high=1., + size=[NUM_SAMPLES]).astype(dtype)) + a = constant_op.constant( + np.random.uniform( + low=np.finfo(dtype).tiny, high=1., + size=[NUM_SAMPLES]).astype(dtype)) + gamma_sample_grad = gen_random_ops.random_gamma_grad(a, x) + actual_grad = implicit_reparameterization_grad(a, x) + gamma_sample_grad, actual_grad = sess.run( + [gamma_sample_grad, actual_grad]) + # We do this because the ratio computed in + # implicit_reparameterization_grad can very easily result in a NaN due + # to the computed numerator and denominator zeroing out. + gamma_sample_grad = gamma_sample_grad[ + ~np.logical_or(np.isnan(actual_grad), np.isinf(actual_grad))] + actual_grad = actual_grad[ + ~np.logical_or(np.isnan(actual_grad), np.isinf(actual_grad))] + self.assertAllClose(actual_grad, gamma_sample_grad, atol=atol, rtol=rtol) + + @parameterized.parameters((np.float32, 1e-2, 1e-11), + (np.float64, 1e-4, 1e-30)) + def testRandomGammaGradMediumValues(self, dtype, rtol, atol): + self.maybe_skip_test(dtype) + + with self.session() as sess: + with self.test_scope(): + x = constant_op.constant( + np.random.uniform(low=1., high=10., + size=[NUM_SAMPLES]).astype(dtype)) + a = constant_op.constant( + np.random.uniform(low=1., high=10., + size=[NUM_SAMPLES]).astype(dtype)) + gamma_sample_grad = gen_random_ops.random_gamma_grad(a, x) + actual_grad = implicit_reparameterization_grad(a, x) + gamma_sample_grad, actual_grad = sess.run( + [gamma_sample_grad, actual_grad]) + # We do this because the ratio computed in + # implicit_reparameterization_grad can very easily result in a NaN due + # to the computed numerator and denominator zeroing out. 
+ gamma_sample_grad = gamma_sample_grad[ + ~np.logical_or(np.isnan(actual_grad), np.isinf(actual_grad))] + actual_grad = actual_grad[ + ~np.logical_or(np.isnan(actual_grad), np.isinf(actual_grad))] + self.assertAllClose(actual_grad, gamma_sample_grad, atol=atol, rtol=rtol) + if __name__ == '__main__': os.environ['XLA_FLAGS'] = '--xla_cpu_enable_fast_math=false' diff --git a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc index 62ed069b4f0..0ea851e9325 100644 --- a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc @@ -264,6 +264,23 @@ xla::XlaOp IgammaImpl(xla::XlaOp x, xla::XlaOp y, XLA_MAKE_BINARY(Igamma, IgammaImpl(lhs, rhs, broadcast_helper)); +xla::XlaOp IgammaGradAImpl(xla::XlaOp x, xla::XlaOp y, + const BCast& broadcast_helper) { + std::tie(x, y) = XlaBinaryOp::Broadcast(x, y, broadcast_helper); + return xla::IgammaGradA(x, y); +} + +XLA_MAKE_BINARY(IgammaGradA, IgammaGradAImpl(lhs, rhs, broadcast_helper)); + +xla::XlaOp RandomGammaGradImpl(xla::XlaOp x, xla::XlaOp y, + const BCast& broadcast_helper) { + std::tie(x, y) = XlaBinaryOp::Broadcast(x, y, broadcast_helper); + return xla::RandomGammaGrad(x, y); +} + +XLA_MAKE_BINARY(RandomGammaGrad, + RandomGammaGradImpl(lhs, rhs, broadcast_helper)); + xla::XlaOp IgammacImpl(xla::XlaOp x, xla::XlaOp y, const BCast& broadcast_helper) { std::tie(x, y) = XlaBinaryOp::Broadcast(x, y, broadcast_helper); diff --git a/tensorflow/compiler/tf2xla/python/xla.py b/tensorflow/compiler/tf2xla/python/xla.py index 3efdda15a94..0df61da57a3 100644 --- a/tensorflow/compiler/tf2xla/python/xla.py +++ b/tensorflow/compiler/tf2xla/python/xla.py @@ -34,6 +34,7 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import bitwise_ops from tensorflow.python.ops import gen_math_ops +from tensorflow.python.ops import gen_random_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops @@ -200,6 +201,8 @@ shift_right_logical = _broadcasting_binary_op(_shift_right_logical_helper) shift_right_arithmetic = _broadcasting_binary_op(_shift_right_arithmetic_helper) igamma = _broadcasting_binary_op(math_ops.igamma) +igamma_grad_a = _broadcasting_binary_op(gen_math_ops.igamma_grad_a) +random_gamma_grad = _broadcasting_binary_op(gen_random_ops.random_gamma_grad) igammac = _broadcasting_binary_op(math_ops.igammac) diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc index 710ac478176..701479614aa 100644 --- a/tensorflow/compiler/xla/client/lib/math.cc +++ b/tensorflow/compiler/xla/client/lib/math.cc @@ -693,7 +693,10 @@ XlaOp Digamma(XlaOp input) { namespace { +enum kIgammaMode { VALUE, DERIVATIVE, SAMPLE_DERIVATIVE }; + // Helper function for computing Igamma using a power series. 
+template XlaOp IgammaSeries(XlaOp ax, XlaOp x, XlaOp a, XlaOp enabled, xla::PrimitiveType type) { // vals: (enabled, r, c, ans, x) @@ -715,24 +718,60 @@ XlaOp IgammaSeries(XlaOp ax, XlaOp x, XlaOp a, XlaOp enabled, XlaOp c = vals[2]; XlaOp ans = vals[3]; XlaOp x = vals[4]; + XlaOp dc_da = vals[5]; + XlaOp dans_da = vals[6]; + r = r + ScalarLike(r, 1); + dc_da = dc_da * (x / r) + (ScalarLike(r, -1) * c * x) / (r * r); + dans_da = dans_da + dc_da; c = c * (x / r); ans = ans + c; + XlaOp conditional; + if (mode == VALUE) { + conditional = And(enabled, Gt(c / ans, Epsilon(builder, type))); + } else { + conditional = + And(enabled, Gt(Abs(dc_da / dans_da), Epsilon(builder, type))); + } + return std::vector{ - And(enabled, Gt(c / ans, Epsilon(builder, type))), - Select(enabled, r, vals[1]), Select(enabled, c, vals[2]), - Select(enabled, ans, vals[3]), Select(enabled, x, vals[4])}; + conditional, + Select(enabled, r, vals[1]), + Select(enabled, c, vals[2]), + Select(enabled, ans, vals[3]), + Select(enabled, x, vals[4]), + Select(enabled, dc_da, vals[5]), + Select(enabled, dans_da, vals[6]), + }; }; auto& b = *ax.builder(); return b.ReportErrorOrReturn([&]() -> StatusOr { - std::vector vals = {enabled, a, FullLike(a, 1), FullLike(a, 1), x}; + std::vector vals = { + enabled, a, FullLike(a, 1), FullLike(a, 1), x, FullLike(a, 0), + FullLike(a, 0), + }; + TF_ASSIGN_OR_RETURN(vals, WhileLoopHelper(cond, body, vals, "igamma", &b)); XlaOp ans = vals[3]; - return (ans * ax) / a; + XlaOp dans_da = vals[6]; + if (mode == VALUE) { + return (ans * ax) / a; + } + + XlaOp dlogax_da = Log(x) - Digamma(a + ScalarLike(a, 1)); + + switch (mode) { + case DERIVATIVE: + return ax * (ans * dlogax_da + dans_da) / a; + case SAMPLE_DERIVATIVE: + default: + return -(dans_da + ans * dlogax_da) * x / a; + } }); } // Helper function for computing Igammac using a continued fraction. 
+template XlaOp IgammacContinuedFraction(XlaOp ax, XlaOp x, XlaOp a, XlaOp enabled, xla::PrimitiveType type) { // vals: enabled, ans, t, y, z, c, pkm1, qkm1, pkm2, qkm2 @@ -754,6 +793,13 @@ XlaOp IgammacContinuedFraction(XlaOp ax, XlaOp x, XlaOp a, XlaOp enabled, XlaOp qkm1 = vals[7]; XlaOp pkm2 = vals[8]; XlaOp qkm2 = vals[9]; + + XlaOp dpkm2_da = vals[10]; + XlaOp dqkm2_da = vals[11]; + XlaOp dpkm1_da = vals[12]; + XlaOp dqkm1_da = vals[13]; + XlaOp dans_da = vals[14]; + c = c + ScalarLike(c, 1); y = y + ScalarLike(y, 1); z = z + ScalarLike(z, 2); @@ -762,18 +808,46 @@ XlaOp IgammacContinuedFraction(XlaOp ax, XlaOp x, XlaOp a, XlaOp enabled, XlaOp qk = qkm1 * z - qkm2 * yc; XlaOp qk_is_nonzero = Ne(qk, ScalarLike(qk, 0)); XlaOp r = pk / qk; + t = Select(qk_is_nonzero, Abs((ans - r) / r), FullLike(t, 1)); ans = Select(qk_is_nonzero, r, ans); + + XlaOp dpk_da = dpkm1_da * z - pkm1 - dpkm2_da * yc + pkm2 * c; + XlaOp dqk_da = dqkm1_da * z - qkm1 - dqkm2_da * yc + qkm2 * c; + XlaOp dans_da_new = + Select(qk_is_nonzero, (dpk_da - ans * dqk_da) / qk, dans_da); + XlaOp grad_conditional = + Select(qk_is_nonzero, Abs(dans_da_new - dans_da), FullLike(dans_da, 1)); + pkm2 = pkm1; pkm1 = pk; qkm2 = qkm1; qkm1 = qk; + + dpkm2_da = dpkm1_da; + dqkm2_da = dqkm1_da; + dpkm1_da = dpk_da; + dqkm1_da = dqk_da; + XlaOp rescale = Gt(Abs(pk), Reciprocal(Epsilon(builder, type))); pkm2 = Select(rescale, pkm2 * Epsilon(builder, type), pkm2); pkm1 = Select(rescale, pkm1 * Epsilon(builder, type), pkm1); qkm2 = Select(rescale, qkm2 * Epsilon(builder, type), qkm2); qkm1 = Select(rescale, qkm1 * Epsilon(builder, type), qkm1); - return std::vector{And(enabled, Gt(t, Epsilon(builder, type))), + + dpkm2_da = Select(rescale, dpkm2_da * Epsilon(builder, type), dpkm2_da); + dqkm2_da = Select(rescale, dqkm2_da * Epsilon(builder, type), dqkm2_da); + dpkm1_da = Select(rescale, dpkm1_da * Epsilon(builder, type), dpkm1_da); + dqkm1_da = Select(rescale, dqkm1_da * Epsilon(builder, type), dqkm1_da); + + XlaOp conditional; + if (mode == VALUE) { + conditional = And(enabled, Gt(t, Epsilon(builder, type))); + } else { + conditional = And(enabled, Gt(grad_conditional, Epsilon(builder, type))); + } + + return std::vector{conditional, Select(enabled, ans, vals[1]), Select(enabled, t, vals[2]), Select(enabled, y, vals[3]), @@ -782,7 +856,12 @@ XlaOp IgammacContinuedFraction(XlaOp ax, XlaOp x, XlaOp a, XlaOp enabled, Select(enabled, pkm1, vals[6]), Select(enabled, qkm1, vals[7]), Select(enabled, pkm2, vals[8]), - Select(enabled, qkm2, vals[9])}; + Select(enabled, qkm2, vals[9]), + Select(enabled, dpkm2_da, vals[10]), + Select(enabled, dqkm2_da, vals[11]), + Select(enabled, dpkm1_da, vals[12]), + Select(enabled, dqkm1_da, vals[13]), + Select(enabled, dans_da_new, vals[14])}; }; auto& b = *ax.builder(); @@ -796,11 +875,31 @@ XlaOp IgammacContinuedFraction(XlaOp ax, XlaOp x, XlaOp a, XlaOp enabled, XlaOp qkm1 = z * x; XlaOp ans = pkm1 / qkm1; XlaOp t = FullLike(x, 1); - std::vector vals = {enabled, ans, t, y, z, - c, pkm1, qkm1, pkm2, qkm2}; + XlaOp dpkm2_da = FullLike(x, 0); + XlaOp dqkm2_da = FullLike(x, 0); + XlaOp dpkm1_da = FullLike(x, 0); + XlaOp dqkm1_da = -x; + XlaOp dans_da = (dpkm1_da - ans * dqkm1_da) / qkm1; + std::vector vals = {enabled, ans, t, y, z, + c, pkm1, qkm1, pkm2, qkm2, + dpkm2_da, dqkm2_da, dpkm1_da, dqkm1_da, dans_da}; + TF_ASSIGN_OR_RETURN(vals, WhileLoopHelper(cond, body, vals, "igammac", &b)); ans = vals[1]; - return ans * ax; + if (mode == VALUE) { + return ans * ax; + } + + dans_da = vals[14]; + XlaOp dlogax_da 
= Log(x) - Digamma(a); + + switch (mode) { + case DERIVATIVE: + return ax * (ans * dlogax_da + dans_da); + case SAMPLE_DERIVATIVE: + default: + return -(dans_da + ans * dlogax_da) * x; + } }); } @@ -820,9 +919,9 @@ XlaOp Igamma(XlaOp a, XlaOp x) { const double nan = std::numeric_limits::quiet_NaN(); XlaOp output = Select( use_igammac, - ScalarLike(a, 1) - - IgammacContinuedFraction(ax, x, a, And(enabled, use_igammac), type), - IgammaSeries(ax, x, a, And(enabled, Not(use_igammac)), type)); + ScalarLike(a, 1) - IgammacContinuedFraction( + ax, x, a, And(enabled, use_igammac), type), + IgammaSeries(ax, x, a, And(enabled, Not(use_igammac)), type)); output = Select(underflow, ZerosLike(output), output); output = Select(x_is_zero, ZerosLike(output), output); output = Select(Or(domain_error, is_nan), FullLike(a, nan), output); @@ -852,6 +951,101 @@ XlaOp Igamma(XlaOp a, XlaOp x) { }); } +XlaOp IgammaGradA(XlaOp a, XlaOp x) { + auto& b = *a.builder(); + auto doit = [&b](XlaOp a, XlaOp x, PrimitiveType type) -> XlaOp { + XlaOp is_nan = Or(IsNan(a), IsNan(x)); + XlaOp x_is_zero = Eq(x, ScalarLike(x, 0)); + XlaOp domain_error = Or(Lt(x, ScalarLike(x, 0)), Le(a, ScalarLike(a, 0))); + XlaOp use_igammac = And(Gt(x, ScalarLike(x, 1)), Gt(x, a)); + XlaOp ax = a * Log(x) - x - Lgamma(a); + XlaOp underflow = Lt(ax, -Log(MaxFiniteValue(&b, type))); + ax = Exp(ax); + XlaOp enabled = Not(Or(Or(Or(x_is_zero, domain_error), underflow), is_nan)); + const double nan = std::numeric_limits::quiet_NaN(); + XlaOp output = Select(use_igammac, + -IgammacContinuedFraction( + ax, x, a, And(enabled, use_igammac), type), + IgammaSeries( + ax, x, a, And(enabled, Not(use_igammac)), type)); + output = Select(underflow, ZerosLike(output), output); + output = Select(x_is_zero, ZerosLike(output), output); + output = Select(Or(domain_error, is_nan), FullLike(a, nan), output); + return output; + }; + return b.ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(auto a_shape, b.GetShape(a)); + TF_ASSIGN_OR_RETURN(auto x_shape, b.GetShape(x)); + if (a_shape != x_shape) { + return InvalidArgument( + "Arguments to IgammaGradA must have equal shapes and types; got %s " + "and %s", + a_shape.ToString(), x_shape.ToString()); + } + TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("IgammaGradA", a)); + bool needs_upcast = + a_shape.element_type() == F16 || a_shape.element_type() == BF16; + + if (needs_upcast) { + a = ConvertElementType(a, F32); + x = ConvertElementType(x, F32); + } + XlaOp result = doit(a, x, a_shape.element_type()); + if (needs_upcast) { + result = ConvertElementType(result, a_shape.element_type()); + } + return result; + }); +} + +// Gradient of Gamma sample from Gamma(a, 1) with respect to `a`. 
+XlaOp RandomGammaGrad(XlaOp a, XlaOp x) { + auto& b = *a.builder(); + auto doit = [&b](XlaOp a, XlaOp x, PrimitiveType type) -> XlaOp { + XlaOp is_nan = Or(IsNan(a), IsNan(x)); + XlaOp x_is_zero = Eq(x, ScalarLike(x, 0)); + XlaOp domain_error = Or(Lt(x, ScalarLike(x, 0)), Le(a, ScalarLike(a, 0))); + XlaOp use_igammac = And(Gt(x, ScalarLike(x, 1)), Gt(x, a)); + XlaOp ax = a * Log(x) - x - Lgamma(a); + XlaOp underflow = Lt(ax, -Log(MaxFiniteValue(&b, type))); + ax = Exp(ax); + XlaOp enabled = Not(Or(Or(Or(x_is_zero, domain_error), underflow), is_nan)); + const double nan = std::numeric_limits::quiet_NaN(); + XlaOp output = Select(use_igammac, + -IgammacContinuedFraction( + ax, x, a, And(enabled, use_igammac), type), + IgammaSeries( + ax, x, a, And(enabled, Not(use_igammac)), type)); + output = Select(underflow, ZerosLike(output), output); + output = Select(x_is_zero, ZerosLike(output), output); + output = Select(Or(domain_error, is_nan), FullLike(a, nan), output); + return output; + }; + return b.ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(auto a_shape, b.GetShape(a)); + TF_ASSIGN_OR_RETURN(auto x_shape, b.GetShape(x)); + if (a_shape != x_shape) { + return InvalidArgument( + "Arguments to RandomGammaGrad must have equal shapes and types; got " + "%s and %s", + a_shape.ToString(), x_shape.ToString()); + } + TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("RandomGammaGrad", a)); + bool needs_upcast = + a_shape.element_type() == F16 || a_shape.element_type() == BF16; + + if (needs_upcast) { + a = ConvertElementType(a, F32); + x = ConvertElementType(x, F32); + } + XlaOp result = doit(a, x, a_shape.element_type()); + if (needs_upcast) { + result = ConvertElementType(result, a_shape.element_type()); + } + return result; + }); +} + XlaOp Igammac(XlaOp a, XlaOp x) { auto& b = *a.builder(); auto doit = [&b](XlaOp a, XlaOp x, PrimitiveType type) -> XlaOp { @@ -863,10 +1057,10 @@ XlaOp Igammac(XlaOp a, XlaOp x) { ax = Exp(ax); XlaOp result = Select(use_igamma, - ScalarLike(a, 1) - - IgammaSeries(ax, x, a, And(enabled, use_igamma), type), - IgammacContinuedFraction(ax, x, a, And(enabled, Not(use_igamma)), - type)); + ScalarLike(a, 1) - IgammaSeries( + ax, x, a, And(enabled, use_igamma), type), + IgammacContinuedFraction( + ax, x, a, And(enabled, Not(use_igamma)), type)); return Select(underflow, ZerosLike(a), Select(out_of_range, FullLike(a, 1), result)); }; diff --git a/tensorflow/compiler/xla/client/lib/math.h b/tensorflow/compiler/xla/client/lib/math.h index ac96a50aecc..f862372a288 100644 --- a/tensorflow/compiler/xla/client/lib/math.h +++ b/tensorflow/compiler/xla/client/lib/math.h @@ -61,6 +61,14 @@ XlaOp Digamma(XlaOp input); // Computes an approximation of the incomplete gamma function. XlaOp Igamma(XlaOp a, XlaOp x); +// Computes an approximation of the derivative of the incomplete gamma function +// with respect to a. +XlaOp IgammaGradA(XlaOp a, XlaOp x); + +// Computes an approximation of the derivative of a sample `x` from a `Gamma(a, +// 1)` distribution with respect to a. +XlaOp RandomGammaGrad(XlaOp a, XlaOp x); + // Computes an approximation of the complementary incomplete gamma function. 
XlaOp Igammac(XlaOp a, XlaOp x); diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index 07fff76668f..a8d4ccb7fd5 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -458,6 +458,8 @@ void BuildOpsSubmodule(py::module* m) { ops.def("Igamma", &Igamma); ops.def("Igammac", &Igammac); + ops.def("IgammaGradA", &IgammaGradA); + ops.def("RandomGammaGrad", &RandomGammaGrad); ops.def("RegularizedIncompleteBeta", &RegularizedIncompleteBeta); #define BINARY_OP(op) \ diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index a8f29009d9e..65545306b0c 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -1698,6 +1698,7 @@ _BINARY_OPS = [ 'ShiftRightLogical', 'Atan2', 'Igamma', + 'IgammaGradA', 'Igammac', 'Complex', 'NextAfter', From 9fae117054131a7d9b197fdf7eb0b7fbbece5df8 Mon Sep 17 00:00:00 2001 From: Bixia Zheng Date: Tue, 18 Feb 2020 16:31:30 -0800 Subject: [PATCH 192/442] Fix trt_mode_test and re-enable the test. PiperOrigin-RevId: 295850803 Change-Id: I482b0614a2ef347b264f500c8b8e74d671f8247e --- tensorflow/python/compiler/tensorrt/BUILD | 3 +-- tensorflow/python/compiler/tensorrt/test/trt_mode_test.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/compiler/tensorrt/BUILD b/tensorflow/python/compiler/tensorrt/BUILD index d51eeec1940..a7c206f4495 100644 --- a/tensorflow/python/compiler/tensorrt/BUILD +++ b/tensorflow/python/compiler/tensorrt/BUILD @@ -135,8 +135,7 @@ cuda_py_tests( "test/rank_two_test.py", "test/reshape_transpose_test.py", "test/topk_test.py", - # TODO(bixia): Reenable when b/149570314 is resolved. - # "test/trt_mode_test.py", + "test/trt_mode_test.py", "test/unary_test.py", "test/vgg_block_nchw_test.py", "test/vgg_block_test.py", diff --git a/tensorflow/python/compiler/tensorrt/test/trt_mode_test.py b/tensorflow/python/compiler/tensorrt/test/trt_mode_test.py index 9a823ab56d4..f70afaf5df1 100644 --- a/tensorflow/python/compiler/tensorrt/test/trt_mode_test.py +++ b/tensorflow/python/compiler/tensorrt/test/trt_mode_test.py @@ -126,7 +126,7 @@ class ExplicitBatchTest(TrtModeTestBase): def ShouldRunTest(self, run_params): # Only run for TRT 6 and above. ver = get_linked_tensorrt_version() - return ver[0] >= 6 + return ver[0] >= 6 and (not run_params.use_calibration) class DynamicShapesTest(TrtModeTestBase): @@ -155,7 +155,7 @@ class DynamicShapesTest(TrtModeTestBase): def ShouldRunTest(self, run_params): # Only run for TRT 6 and above. ver = get_linked_tensorrt_version() - return ver[0] >= 6 + return ver[0] >= 6 and (not run_params.use_calibration) if __name__ == "__main__": From 49aeb94a522d7b26f601bc3a710857723784ed63 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Tue, 18 Feb 2020 17:06:09 -0800 Subject: [PATCH 193/442] Enable splitting out TFL runtime verification from op verification. This enables use cases where the elemental types of the operations during conversion/optimization may differ from what the TFLite runtime supports. The approach followed here is pretty local and it could be changed to something more general, but this allows verifying TFLite ops without verifying supported runtime until such time. Made the change in as small a way as possible and so the op verification could be tightened, but no reduction in verification on the path to TFLite runtime. 
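
To make the intended usage concrete before the details: a hedged sketch (illustrative only; the real pass is the new transforms/runtime_type_verify.cc added by this change, and the pass name below is hypothetical) of how a function pass can drive the generated per-op hooks:

    // Sketch: walk every op implementing the new interface and run its
    // generated runtime-type verifier; only TflRuntimeVerifyOpInterface and
    // VerifyTflRuntimeTypes come from this patch, the rest is standard MLIR.
    #include "mlir/Pass/Pass.h"
    #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"

    namespace mlir {
    namespace TFL {
    namespace {
    struct ExampleRuntimeVerifyPass
        : public FunctionPass<ExampleRuntimeVerifyPass> {
      void runOnFunction() override {
        getFunction().walk([&](TflRuntimeVerifyOpInterface op) {
          // Each TFL op checks its own operand/result element types against
          // what the TFLite runtime supports.
          if (failed(op.VerifyTflRuntimeTypes(op.getOperation())))
            signalPassFailure();
        });
      }
    };
    }  // namespace
    }  // namespace TFL
    }  // namespace mlir

Keeping this check in a separate pass is what lets op verification stay permissive during conversion/optimization while still rejecting unsupported elemental types on the path to the TFLite runtime.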
A downside to this local approach is that it makes the autogenerated documentation less useful for folks interested in seeing the runtime supported types/constraints. PiperOrigin-RevId: 295857615 Change-Id: I5fa1bb8d83b740fe359e8551e22907fcf6fdfeb7 --- tensorflow/compiler/mlir/lite/BUILD | 18 +- ...ator_converter_gen.cc => converter_gen.cc} | 111 ++++- .../mlir/lite/ir/tfl_op_interfaces.td | 19 + tensorflow/compiler/mlir/lite/ir/tfl_ops.cc | 1 + tensorflow/compiler/mlir/lite/ir/tfl_ops.td | 466 ++++++++++-------- .../lite/python/graphdef_to_tfl_flatbuffer.cc | 1 + tensorflow/compiler/mlir/lite/tests/ops.mlir | 2 +- .../compiler/mlir/lite/tf_tfl_translate.cc | 3 + .../compiler/mlir/lite/transforms/passes.h | 3 + .../lite/transforms/runtime_type_verify.cc | 52 ++ 10 files changed, 453 insertions(+), 223 deletions(-) rename tensorflow/compiler/mlir/lite/{operator_converter_gen.cc => converter_gen.cc} (75%) create mode 100644 tensorflow/compiler/mlir/lite/transforms/runtime_type_verify.cc diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index ce091dabd9e..1ab9b70555d 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -208,6 +208,7 @@ cc_library( "ir/tfl_ops.h.inc", "ir/tfl_ops_interface.cc.inc", "ir/tfl_ops_interface.h.inc", + "runtime_verifiers.inc", "utils/attribute_utils.cc", ], hdrs = [ @@ -303,12 +304,14 @@ cc_library( "transforms/optimize_functional_ops.cc", "transforms/prepare_composite_functions_tf.cc", "transforms/prepare_tf.cc", + "transforms/runtime_type_verify.cc", "transforms/split_merged_operands.cc", "transforms/trim_functions_tf.cc", "transforms/unroll_batch_matmul.cc", "transforms/while_loop_outline.cc", ], hdrs = [ + "ir/tfl_ops_interface.h.inc", "transforms/dilated_conv.h", "transforms/passes.h", "transforms/unroll_batch_matmul.h", @@ -461,9 +464,9 @@ cc_library( ) tf_native_cc_binary( - name = "operator-converter-gen", + name = "converter-gen", srcs = [ - "operator_converter_gen.cc", + "converter_gen.cc", ], deps = [ "@llvm-project//llvm:support", @@ -473,14 +476,18 @@ tf_native_cc_binary( ) gentbl( - name = "operator_converter_inc", + name = "converter_inc", tbl_outs = [ ( - "", # This driver has no options. + "--gen-operator-converters", "operator_converters.inc", ), + ( + "--gen-runtime-verifiers", + "runtime_verifiers.inc", + ), ], - tblgen = ":operator-converter-gen", + tblgen = ":converter-gen", td_file = "ir/tfl_ops.td", td_srcs = [ ":tensorflow_lite_ops_td_files", @@ -650,6 +657,7 @@ tf_cc_binary( "@com_google_absl//absl/strings", "@llvm-project//llvm:support", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", "@llvm-project//mlir:Support", ], ) diff --git a/tensorflow/compiler/mlir/lite/operator_converter_gen.cc b/tensorflow/compiler/mlir/lite/converter_gen.cc similarity index 75% rename from tensorflow/compiler/mlir/lite/operator_converter_gen.cc rename to tensorflow/compiler/mlir/lite/converter_gen.cc index 6ebc71fd029..02d9ef45591 100644 --- a/tensorflow/compiler/mlir/lite/operator_converter_gen.cc +++ b/tensorflow/compiler/mlir/lite/converter_gen.cc @@ -28,6 +28,9 @@ limitations under the License. 
#include "llvm/TableGen/Record.h" #include "llvm/TableGen/TableGenBackend.h" #include "mlir/TableGen/Attribute.h" // TF:llvm-project +#include "mlir/TableGen/Format.h" // TF:llvm-project +#include "mlir/TableGen/Operator.h" // TF:llvm-project +#include "mlir/TableGen/Predicate.h" // TF:llvm-project using llvm::DefInit; using llvm::dyn_cast; @@ -41,6 +44,19 @@ using llvm::SmallVector; using llvm::StringInit; using llvm::StringRef; +enum ActionType { + OpConv, + RuntimeVerify, +}; + +// NOLINTNEXTLINE +llvm::cl::opt action( + llvm::cl::desc("Action to perform:"), + llvm::cl::values(clEnumValN(OpConv, "gen-operator-converters", + "Generate operator converters"), + clEnumValN(RuntimeVerify, "gen-runtime-verifiers", + "Generate TFLite runtime verifiers"))); + // Returns the associated option name for the given op definition. static inline std::string GetOperatorOptionName(const Record &def) { assert(def.getName().startswith("TFL_") && "unexpected op prefix"); @@ -342,8 +358,101 @@ static bool OperatorWritersMain(raw_ostream &os, RecordKeeper &records) { return false; } +static void GenOperandResultVerifier(raw_ostream &os, + llvm::ArrayRef values, + StringRef valueKind) { + mlir::tblgen::FmtContext fctx; + + bool first = true; + for (auto static_value : llvm::enumerate(values)) { + auto *definit = llvm::cast(static_value.value()); + auto *val = definit->getDef()->getValue("tflRuntimeTypePredicate"); + if (!val) continue; + + // Create code block on first type to verify. + if (first) { + os << " {\n"; + os << " unsigned index = " << static_value.index() << ";\n"; + first = false; + } + + mlir::tblgen::Pred pred(dyn_cast(val->getValue())); + auto desc = + definit->getDef()->getValueAsString("tflRuntimeTypeDescription"); + + // Emit a loop to check all the dynamic values in the pack. + os << formatv(" for (Value v : top.getODS{0}{1}s({2})) {{\n", + // Capitalize the first letter to match the function name + valueKind.substr(0, 1).upper(), valueKind.substr(1), + static_value.index()); + + os << " (void)v;\n" + << " if (!(" + << tgfmt(pred.getCondition(), &fctx.withSelf("v.getType()")) << ")) {\n" + << formatv( + " return op->emitOpError(\"{0} #\") << index " + "<< \" must be {1}, but got \" << v.getType();\n", + valueKind, desc) + << " }\n" // if + << " ++index;\n" + << " }\n"; // for + } + + // Emit closing brace if needed. + if (!first) os << " }\n"; +} + +// NOLINTNEXTLINE +static bool RuntimeVerifierWriterMain(raw_ostream &os, RecordKeeper &records) { + emitSourceFileHeader("MLIR TFLite Runtime Verifiers", os); + + // Retrieve all the definitions derived from TFL_Op and sort by record name. + std::vector defs = records.getAllDerivedDefinitions("Op"); + llvm::sort(defs, LessRecord()); + + // Iterate through all the ops defined. + for (const auto *def : defs) { + mlir::tblgen::Operator op(*def); + if (!op.getTrait("TflRuntimeVerifyOpInterface::Trait")) continue; + + mlir::tblgen::FmtContext verify_ctx; + os << "::mlir::LogicalResult " << op.getCppClassName() + << "::VerifyTflRuntimeTypes(::mlir::Operation *op) {\n"; + os << " auto top = cast<" << op.getCppClassName() << ">(op); (void)top;\n"; + verify_ctx.withOp("top"); + + for (int i = 0, e = op.getNumOperands(); i < e; ++i) { + for (int i = 0, e = op.getNumOperands(); i < e; ++i) { + auto &value = op.getOperand(i); + // Skip from from first variadic operands for now. Else getOperand index + // used below doesn't match. 
+ if (value.isVariadic()) break; + if (!value.name.empty()) + verify_ctx.addSubst(value.name, formatv("op->getOperand({0})", i)); + } + for (int i = 0, e = op.getNumResults(); i < e; ++i) { + auto &value = op.getResult(i); + // Skip from from first variadic results for now. Else getResult index + // used below doesn't match. + if (value.isVariadic()) break; + if (!value.name.empty()) + verify_ctx.addSubst(value.name, formatv("op->getResult({0})", i)); + } + } + GenOperandResultVerifier(os, def->getValueAsDag("arguments")->getArgs(), + "operand"); + GenOperandResultVerifier(os, def->getValueAsDag("results")->getArgs(), + "result"); + os << " return mlir::success();\n}\n"; + } + + return false; +} + int main(int argc, char **argv) { llvm::InitLLVM y(argc, argv); llvm::cl::ParseCommandLineOptions(argc, argv); - return TableGenMain(argv[0], &OperatorWritersMain); + if (action == ActionType::OpConv) + return TableGenMain(argv[0], &OperatorWritersMain); + return TableGenMain(argv[0], &RuntimeVerifierWriterMain); } diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td b/tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td index 8c72e93d1aa..8e100538659 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td @@ -71,4 +71,23 @@ def TFL_SparseOp : OpInterface<"SparseOpInterface"> { ]; } +//===----------------------------------------------------------------------===// +// TFL runtime type verification of operand/result types. + +def TFL_RuntimeVerification : OpInterface<"TflRuntimeVerifyOpInterface"> { + let description = [{ + Interface to verify TFLite runtime op verification. + + This verifies that the converted TFLite ops has operand/result type + supported by the TFLite runtime. + }]; + + let methods = [ + StaticInterfaceMethod< + [{Returns whether the op's operands/results are supported by runtime.}], + "LogicalResult", "VerifyTflRuntimeTypes", (ins "Operation*":$op) + >, + ]; +} + #endif // TFL_OP_INTERFACES diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc index 2c9f7badb23..be70d20dc12 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc @@ -1872,6 +1872,7 @@ LogicalResult WhileOp::moveOutOfLoop(llvm::ArrayRef ops) { #include "tensorflow/compiler/mlir/lite/ir/tfl_ops_interface.cc.inc" #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.cc.inc" +#include "tensorflow/compiler/mlir/lite/runtime_verifiers.inc" Operation *TensorFlowLiteDialect::materializeConstant(OpBuilder &builder, Attribute value, diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 9444aab6ce8..3bb2b67be35 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -109,29 +109,63 @@ def TensorTypeAttr : TypeAttrBase<"TensorType", "Tensor type attribute">; // Derived shape attribute class. //===----------------------------------------------------------------------===// class DerivedShapeAttr : DerivedAttr<"ArrayRef", body>; -class DerivedTFLiteTypeAttr : DerivedAttr<"tflite::TensorType", body>; +class DerivedTFLiteTypeAttr : + DerivedAttr<"tflite::TensorType", body>; + +// These additional types/type constraints here are used to decouple the ops +// from runtime support for the ops. Prefer to use these types when defining +// new TF_Ops for uniformity. + +// TFL Runtime type predicate. 
+class TFL_RuntimeType { + Pred tflRuntimeTypePredicate = t.predicate; + string tflRuntimeTypeDescription = t.description; +} + +class TFL_AnyTypeOf allowedRuntimeTypes, string description = "", + list allowedOpTypes = [AnyType]> : + AnyTypeOf, + TFL_RuntimeType>; + +class TFL_TensorOf allowedRuntimeTypes, + list allowedOpTypes = [AnyType]> : + TensorOf, TFL_RuntimeType>; + +class TFL_TensorOfOrNone allowedRuntimeTypes, string description = "", + list allowedOpTypes = [AnyType]> : + AnyTypeOf<[TFL_TensorOf, NoneType], description>, + TFL_RuntimeType, NoneType]>>; + +class TFL_VariadicTensorOf allowedRuntimeTypes, + list allowedOpTypes = [AnyType]> : + Variadic>, + TFL_RuntimeType>>; def TFL_Int32Or64 : IntOfWidths<[32, 64]>; -def TFL_FpTensor : TensorOf<[AnyFloat]>; - -def TFL_I32OrI64Tensor : TensorOf<[TFL_Int32Or64]>; - -def TFL_BoolTensor : TypeAlias; - +def TFL_BoolTensor : TFL_TensorOf<[I1]>; +def TFL_FpOrI32OrI64Tensor : TFL_TensorOf<[AnyFloat, TFL_Int32Or64]>; +def TFL_FpTensor : TFL_TensorOf<[AnyFloat]>; +def TFL_I32OrI64Tensor : TFL_TensorOf<[TFL_Int32Or64]>; +def TFL_I32Tensor : TFL_TensorOf<[I32]>; +def TFL_I64Tensor : TFL_TensorOf<[I64]>; // TODO(jpienaar): Expand to all int types. -def TFL_IntTensor : TypeAlias; +def TFL_IntTensor : TypeAlias; + +class TFL_0DTensorOf allowedRuntimeTypes, + list allowedOpTypes = [AnyType]> : + 0DTensorOf, TFL_RuntimeType>; +class TFL_1DTensorOf allowedRuntimeTypes, + list allowedOpTypes = [AnyType]> : + 1DTensorOf, TFL_RuntimeType>; +class TFL_2DTensorOf allowedRuntimeTypes, + list allowedOpTypes = [AnyType]> : + 2DTensorOf, TFL_RuntimeType>; // This is used to represent the type of "ref tensors" or tensors that are // used as variables to track state. def TFL_StatefulTensor : TypeAlias; -// Tensor or None type. -class TFL_TensorOfOrNone allowedTypes, string description = ""> : - AnyTypeOf<[TensorOf, NoneType], description>; - -def TFL_FpOrI32OrI64Tensor : TensorOf<[AnyFloat, TFL_Int32Or64]>; - //===----------------------------------------------------------------------===// // Rank/Shape helpers. //===----------------------------------------------------------------------===// @@ -255,7 +289,8 @@ def TFL_ComparisonBinaryBuilder : OpBuilder< //===----------------------------------------------------------------------===// class TFL_Op traits = []> : - Op { + Op])> { // FlatBuffer generation specific information. // ------------------------------------------- // When generating the FlatBuffer output some operations have @@ -360,11 +395,11 @@ def TFL_AddNOp : TFL_Op<"add_n", [Commutative, NoSideEffect, SameOperandsAndResu }]; let arguments = (ins - Variadic>:$inputs + TFL_VariadicTensorOf<[F32, I32, QI16, QUI16]>:$inputs ); let results = (outs - TensorOf<[F32, I32, QI16, QUI16]>:$sum + TFL_TensorOf<[F32, I32, QI16, QUI16]>:$sum ); } @@ -381,14 +416,14 @@ retained with length 1. }]; let arguments = (ins - I1Tensor:$input, - I32Tensor:$reduction_indices, + TFL_BoolTensor:$input, + TFL_I32Tensor:$reduction_indices, DefaultValuedAttr:$keep_dims ); let results = (outs - I1Tensor:$output + TFL_BoolTensor:$output ); let hasOptions = 1; @@ -403,10 +438,10 @@ def TFL_TransposeConvOp: Performs transpose convolution operation on input. 
}]; - let arguments = ( - ins 1DTensorOf<[I32]>:$output_shape, - TensorOf<[F32, TFL_Uint8, QI8, QUI8]>:$weights, - TensorOf<[F32, TFL_Uint8, QI8, QUI8]>:$input, + let arguments = (ins + TFL_1DTensorOf<[I32]>:$output_shape, + TFL_TensorOf<[F32, TFL_Uint8, QI8, QUI8]>:$weights, + TFL_TensorOf<[F32, TFL_Uint8, QI8, QUI8]>:$input, TFL_PaddingAttr:$padding, I32Attr:$stride_h, I32Attr:$stride_w @@ -478,7 +513,7 @@ def TFL_ArgMaxOp : TFL_Op<"arg_max", [NoSideEffect]> { }]; let arguments = ( - ins TensorOf<[F32, I32, I8, TFL_Uint8, QI8, QUI8]>:$input, + ins TFL_TensorOf<[F32, I32, I8, TFL_Uint8, QI8, QUI8]>:$input, TFL_I32OrI64Tensor:$dim ); @@ -506,7 +541,7 @@ def TFL_ArgMinOp : TFL_Op<"arg_min", [NoSideEffect]> { }]; let arguments = ( - ins TensorOf<[F32, I32, I8, TFL_Uint8, QI8, QUI8]>:$input, + ins TFL_TensorOf<[F32, I32, I8, TFL_Uint8, QI8, QUI8]>:$input, TFL_I32OrI64Tensor:$dim ); @@ -549,14 +584,14 @@ def TFL_ConcatenationOp : TFL_Op<"concatenation", }]; let arguments = ( - ins Variadic>:$values, + ins TFL_VariadicTensorOf< + [F32, I64, I32, I16, I8, QI8, QUI8, QI16, TFL_Uint8]>:$values, I32Attr:$axis, TFL_AFAttr:$fused_activation_function ); let results = (outs - TensorOf< + TFL_TensorOf< [F32, I64, I32, I16, I8, QI8, QUI8, QI16, TFL_Uint8]>:$output ); @@ -708,8 +743,8 @@ def TFL_FullyConnectedOp : TFL_Op<"fully_connected", [ let summary = "Fully connected op"; let arguments = (ins - TensorOf<[F32, QI8, QUI8, QI16, QUI16]>:$input, - TensorOf<[F32, QI8, QUI8, QI16, QUI16]>:$filter, + TFL_TensorOf<[F32, QI8, QUI8, QI16, QUI16]>:$input, + TFL_TensorOf<[F32, QI8, QUI8, QI16, QUI16]>:$filter, TFL_TensorOfOrNone<[F32, QI32, QUI32]>:$bias, TFL_AFAttr:$fused_activation_function, @@ -719,7 +754,7 @@ def TFL_FullyConnectedOp : TFL_Op<"fully_connected", [ // Depending on the weights format, this op can have one or two outputs. 
let results = (outs - Variadic>:$output + TFL_VariadicTensorOf<[F32, QI8, QUI8, QI16, QUI16]>:$output ); let verifier = [{ return Verify(*this); }]; @@ -748,8 +783,8 @@ def TFL_GatherOp : TFL_Op<"gather", [ }]; let arguments = (ins - TensorOf<[F32, I1, I8, I32, I64, TFL_Str, QI8, QUI8, QI16]>:$params, - TensorOf<[I32, I64]>:$indices, + TFL_TensorOf<[F32, I1, I8, I32, I64, TFL_Str, QI8, QUI8, QI16]>:$params, + TFL_TensorOf<[I32, I64]>:$indices, I32Attr:$axis ); @@ -761,7 +796,7 @@ def TFL_GatherOp : TFL_Op<"gather", [ ]; let results = (outs - TensorOf<[F32, I1, I8, I32, I64, TFL_Str, QI8, QUI8, QI16]>:$output + TFL_TensorOf<[F32, I1, I8, I32, I64, TFL_Str, QI8, QUI8, QI16]>:$output ); let hasOptions = 1; @@ -775,12 +810,12 @@ def TFL_GatherNdOp : TFL_Op<"gather_nd", [NoSideEffect]> { }]; let arguments = (ins - TensorOf<[F32, I8, I64, I32, TFL_Uint8]>:$params, + TFL_TensorOf<[F32, I8, I64, I32, TFL_Uint8]>:$params, TFL_I32OrI64Tensor:$indices ); let results = (outs - TensorOf<[F32, I8, I64, I32, TFL_Uint8]>:$output + TFL_TensorOf<[F32, I8, I64, I32, TFL_Uint8]>:$output ); } @@ -794,8 +829,8 @@ def TFL_LessEqualOp : TFL_Op<"less_equal", [ }]; let arguments = ( - ins TensorOf<[F32, I32, I64, I8, QI8, QUI8, TFL_Uint8]>:$lhs, - TensorOf<[F32, I32, I64, I8, QI8, QUI8, TFL_Uint8]>:$rhs); + ins TFL_TensorOf<[F32, I32, I64, I8, QI8, QUI8, TFL_Uint8]>:$lhs, + TFL_TensorOf<[F32, I32, I64, I8, QI8, QUI8, TFL_Uint8]>:$rhs); let results = (outs TFL_BoolTensor:$output); @@ -827,7 +862,7 @@ convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imag }]; let arguments = (ins - TensorOf<[F32, QI8, QUI8]>:$input, + TFL_TensorOf<[F32, QI8, QUI8]>:$input, I32Attr:$radius, F32Attr:$bias, F32Attr:$alpha, @@ -835,7 +870,7 @@ convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imag ); let results = (outs - TensorOf<[F32, QI8, QUI8]>:$output + TFL_TensorOf<[F32, QI8, QUI8]>:$output ); let hasOptions = 1; @@ -881,11 +916,11 @@ def TFL_MatrixDiagOp : TFL_Op<"matrix_diag", [ }]; let arguments = (ins - TensorOf<[F32, I8, I64, I32, TFL_Uint8]>:$diagonal + TFL_TensorOf<[F32, I8, I64, I32, TFL_Uint8]>:$diagonal ); let results = (outs - TensorOf<[F32, I8, I64, I32, TFL_Uint8]>:$output + TFL_TensorOf<[F32, I8, I64, I32, TFL_Uint8]>:$output ); let hasOptions = 0; @@ -958,14 +993,14 @@ using the `tf.gather operation`. For example: let arguments = (ins TFL_FpTensor:$boxes, TFL_FpTensor:$scores, - I32Tensor:$max_output_size, + TFL_I32Tensor:$max_output_size, TFL_FpTensor:$iou_threshold, TFL_FpTensor:$score_threshold ); let results = (outs - I32Tensor:$selected_indices, - I32Tensor:$valid_outputs + TFL_I32Tensor:$selected_indices, + TFL_I32Tensor:$valid_outputs ); } @@ -1012,16 +1047,16 @@ larger than 0. 
let arguments = (ins TFL_FpTensor:$boxes, TFL_FpTensor:$scores, - I32Tensor:$max_output_size, + TFL_I32Tensor:$max_output_size, TFL_FpTensor:$iou_threshold, TFL_FpTensor:$score_threshold, TFL_FpTensor:$soft_nms_sigma ); let results = (outs - I32Tensor:$selected_indices, + TFL_I32Tensor:$selected_indices, TFL_FpTensor:$selected_scores, - I32Tensor:$valid_outputs + TFL_I32Tensor:$valid_outputs ); } @@ -1105,11 +1140,11 @@ def TFL_EmbeddingLookupOp: TFL_Op<"embedding_lookup", }]; let arguments = (ins - TensorOf<[I32]>:$lookup, - TensorOf<[F32, I8, TFL_Uint8]>:$value + TFL_TensorOf<[I32]>:$lookup, + TFL_TensorOf<[F32, I8, TFL_Uint8]>:$value ); - let results = (outs TensorOf<[F32, I8, TFL_Uint8]>:$output); + let results = (outs TFL_TensorOf<[F32, I8, TFL_Uint8]>:$output); } def TFL_EqualOp: TFL_Op<"equal", [Commutative, ResultsBroadcastableShape, @@ -1123,8 +1158,8 @@ def TFL_EqualOp: TFL_Op<"equal", [Commutative, ResultsBroadcastableShape, let arguments = ( ins - TensorOf<[I1, F32, I32, I64, I8, QI8, QUI8, TFL_Uint8]>:$x, - TensorOf<[I1, F32, I32, I64, I8, QI8, QUI8, TFL_Uint8]>:$y + TFL_TensorOf<[I1, F32, I32, I64, I8, QI8, QUI8, TFL_Uint8]>:$x, + TFL_TensorOf<[I1, F32, I32, I64, I8, QI8, QUI8, TFL_Uint8]>:$y ); let results = (outs TFL_BoolTensor:$output); @@ -1284,10 +1319,10 @@ def TFL_FloorModOp : TFL_Op<"floor_mod", [ResultsBroadcastableShape, NoSideEffec }]; let arguments = ( - ins TensorOf<[I32, I64, F32]>:$lhs, - TensorOf<[I32, I64, F32]>:$rhs); + ins TFL_TensorOf<[I32, I64, F32]>:$lhs, + TFL_TensorOf<[I32, I64, F32]>:$rhs); - let results = (outs TensorOf<[I32, I64, F32]>:$output); + let results = (outs TFL_TensorOf<[I32, I64, F32]>:$output); let builders = [TFL_BroadcastableBinaryBuilder]; } @@ -1322,9 +1357,9 @@ def TFL_HardSwishOp: TFL_Op<"hard_swish", [NoSideEffect, element-wise. }]; - let arguments = (ins TensorOf<[F32, QUI8, QI8]>:$input); + let arguments = (ins TFL_TensorOf<[F32, QUI8, QI8]>:$input); - let results = (outs TensorOf<[F32, QUI8, QI8]>:$out); + let results = (outs TFL_TensorOf<[F32, QUI8, QI8]>:$out); let hasOptions = 0; } @@ -1342,11 +1377,11 @@ def TFL_L2NormalizationOp : TFL_Op<"l2_normalization", [NoSideEffect, }]; let arguments = (ins - TensorOf<[F32, QUI8, QI8, QUI16, QI16, I8]>:$input, + TFL_TensorOf<[F32, QUI8, QI8, QUI16, QI16, I8]>:$input, TFL_AFAttr:$fused_activation_function ); - let results = (outs TensorOf<[F32, QUI8, QI8, QUI16, QI16, I8]>:$output); + let results = (outs TFL_TensorOf<[F32, QUI8, QI8, QUI16, QI16, I8]>:$output); let hasOptions = 1; @@ -1403,10 +1438,10 @@ def TFL_LogicalAndOp : TFL_Op<"logical_and", [NoSideEffect]> { }]; let arguments = ( - ins I1Tensor:$lhs, - I1Tensor:$rhs); + ins TFL_BoolTensor:$lhs, + TFL_BoolTensor:$rhs); - let results = (outs I1Tensor:$output); + let results = (outs TFL_BoolTensor:$output); let parser = [{ return mlir::impl::parseOneResultSameOperandTypeOp(parser, result); }]; @@ -1420,9 +1455,9 @@ def TFL_LogicalNotOp : TFL_Op<"logical_not", [NoSideEffect, NoQuantizableResult] Element-wise logical NOT operation. 
}]; - let arguments = (ins I1Tensor:$lhs); + let arguments = (ins TFL_BoolTensor:$lhs); - let results = (outs I1Tensor:$output); + let results = (outs TFL_BoolTensor:$output); } def TFL_LogicalOrOp : TFL_Op<"logical_or", [NoSideEffect]> { @@ -1433,10 +1468,10 @@ def TFL_LogicalOrOp : TFL_Op<"logical_or", [NoSideEffect]> { }]; let arguments = ( - ins I1Tensor:$lhs, - I1Tensor:$rhs); + ins TFL_BoolTensor:$lhs, + TFL_BoolTensor:$rhs); - let results = (outs I1Tensor:$output); + let results = (outs TFL_BoolTensor:$output); let parser = [{ return mlir::impl::parseOneResultSameOperandTypeOp(parser, result); }]; @@ -1456,9 +1491,9 @@ def TFL_LogisticOp: TFL_Op<"logistic", [ Computes element-wise Sigmoid of input }]; - let arguments = (ins TensorOf<[AnyFloat, QI8, QUI8, QI16, QUI16]>:$x); + let arguments = (ins TFL_TensorOf<[AnyFloat, QI8, QUI8, QI16, QUI16]>:$x); - let results = (outs TensorOf<[AnyFloat, QI8, QUI8, QI16, QUI16]>:$y); + let results = (outs TFL_TensorOf<[AnyFloat, QI8, QUI8, QI16, QUI16]>:$y); } def TFL_LogOp: TFL_Op<"log", [ @@ -1608,12 +1643,12 @@ def TFL_MaximumOp : TFL_Op<"maximum", [ }]; let arguments = ( - ins TensorOf<[AnyFloat, TFL_Int32Or64, QI8, QUI8]>:$lhs, - TensorOf<[AnyFloat, TFL_Int32Or64, QI8, QUI8]>:$rhs + ins TFL_TensorOf<[AnyFloat, TFL_Int32Or64, QI8, QUI8]>:$lhs, + TFL_TensorOf<[AnyFloat, TFL_Int32Or64, QI8, QUI8]>:$rhs ); let results = (outs - TensorOf<[AnyFloat, TFL_Int32Or64, QI8, QUI8]>:$max + TFL_TensorOf<[AnyFloat, TFL_Int32Or64, QI8, QUI8]>:$max ); let builders = [TFL_BroadcastableBinaryBuilder]; @@ -1633,13 +1668,13 @@ def TFL_MeanOp : TFL_Op<"mean", [NoSideEffect, SameOperandsAndResultsScale]> { }]; let arguments = (ins - TensorOf<[F32, I8, I32, I64, QI8, QUI8, TFL_Uint8]>:$input, - TensorOf<[I32, I64]>:$axis, + TFL_TensorOf<[F32, I8, I32, I64, QI8, QUI8, TFL_Uint8]>:$input, + TFL_TensorOf<[I32, I64]>:$axis, BoolAttr:$keep_dims ); let results = (outs - TensorOf<[F32, I32, I64, I8, QI8, QUI8, TFL_Uint8]>:$output); + TFL_TensorOf<[F32, I32, I64, I8, QI8, QUI8, TFL_Uint8]>:$output); let hasOptions = 1; let customOption = "ReducerOptions"; @@ -1658,16 +1693,16 @@ def TFL_OneHotOp : TFL_Op<"one_hot", [NoSideEffect]> { }]; let arguments = (ins - TensorOf<[I32, I64]>:$indices, - I32Tensor:$depth, - TensorOf<[F32, I32, I64, I1]>:$on_value, - TensorOf<[F32, I32, I64, I1]>:$off_value, + TFL_TensorOf<[I32, I64]>:$indices, + TFL_I32Tensor:$depth, + TFL_TensorOf<[F32, I32, I64, I1]>:$on_value, + TFL_TensorOf<[F32, I32, I64, I1]>:$off_value, I32Attr:$axis ); let results = (outs - TensorOf<[F32, I32, I64, I1]>:$output + TFL_TensorOf<[F32, I32, I64, I1]>:$output ); let hasOptions = 1; @@ -1681,11 +1716,11 @@ Rounds the values of a tensor to the nearest integer, element-wise. 
}]; let arguments = (ins - TensorOf<[F32]>:$x + TFL_TensorOf<[F32]>:$x ); let results = (outs - TensorOf<[F32]>:$y + TFL_TensorOf<[F32]>:$y ); } @@ -1729,7 +1764,7 @@ def TFL_SumOp: TFL_Op<"sum", [NoSideEffect]> { let arguments = (ins AnyTensor:$input, - I32Tensor:$axes, + TFL_I32Tensor:$axes, BoolAttr:$keep_dims ); @@ -1749,7 +1784,7 @@ def TFL_ReduceMinOp: TFL_Op<"reduce_min", [ let arguments = (ins AnyTensor:$input, - I32Tensor:$axes, + TFL_I32Tensor:$axes, BoolAttr:$keep_dims ); @@ -1769,7 +1804,7 @@ def TFL_ReduceMaxOp: TFL_Op<"reduce_max", [ let arguments = (ins AnyTensor:$input, - I32Tensor:$axes, + TFL_I32Tensor:$axes, BoolAttr:$keep_dims ); @@ -1787,8 +1822,8 @@ def TFL_ReduceProdOp: TFL_Op<"reduce_prod", [NoSideEffect]> { }]; let arguments = (ins - TensorOf<[F32, I8, I32, I64]>:$input, - I32Tensor:$axes, + TFL_TensorOf<[F32, I8, I32, I64]>:$input, + TFL_I32Tensor:$axes, BoolAttr:$keep_dims ); @@ -1807,12 +1842,12 @@ def TFL_MinimumOp : TFL_Op<"minimum", [ }]; let arguments = ( - ins TensorOf<[AnyFloat, TFL_Int32Or64, QI8, QUI8]>:$lhs, - TensorOf<[AnyFloat, TFL_Int32Or64, QI8, QUI8]>:$rhs + ins TFL_TensorOf<[AnyFloat, TFL_Int32Or64, QI8, QUI8]>:$lhs, + TFL_TensorOf<[AnyFloat, TFL_Int32Or64, QI8, QUI8]>:$rhs ); let results = (outs - TensorOf<[AnyFloat, TFL_Int32Or64, QI8, QUI8]>:$min + TFL_TensorOf<[AnyFloat, TFL_Int32Or64, QI8, QUI8]>:$min ); let builders = [TFL_BroadcastableBinaryBuilder]; @@ -1892,14 +1927,14 @@ def TFL_PackOp : TFL_Op<"pack", [NoSideEffect, SameOperandsAndResultsScale]> { }]; let arguments = (ins - Variadic>:$values, + TFL_VariadicTensorOf<[F32, I8, I16, I32, I64, QI8, QUI8, QI16]>:$values, I32Attr:$values_count, I32Attr:$axis ); let results = (outs - TensorOf<[F32, I8, I16, I32, I64, QI8, QUI8, QI16]>:$output + TFL_TensorOf<[F32, I8, I16, I32, I64, QI8, QUI8, QI16]>:$output ); let verifier = [{ return Verify(*this); }]; @@ -1941,11 +1976,10 @@ def TFL_PadOp : TFL_Op<"pad", [ ``` }]; - let arguments = ( - ins TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$input, + let arguments = (ins TFL_TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$input, TFL_I32OrI64Tensor:$padding); - let results = (outs TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$output); + let results = (outs TFL_TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$output); let hasOptions = 1; } @@ -1988,11 +2022,11 @@ def TFL_PadV2Op : TFL_Op<"padv2", [ }]; let arguments = ( - ins TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$input, + ins TFL_TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$input, TFL_I32OrI64Tensor:$padding, - TensorOf<[F32, I8, I32, I64]>:$constant_values); + TFL_TensorOf<[F32, I8, I32, I64]>:$constant_values); - let results = (outs TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$output); + let results = (outs TFL_TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$output); let hasOptions = 1; } @@ -2030,11 +2064,11 @@ def TFL_PReluOp : TFL_Op<"prelu", [NoSideEffect]> { }]; let arguments = ( - ins TensorOf<[F32, QUI8]>:$input, - TensorOf<[F32, QUI8]>:$alpha + ins TFL_TensorOf<[F32, QUI8]>:$input, + TFL_TensorOf<[F32, QUI8]>:$alpha ); - let results = (outs TensorOf<[F32, QUI8]>:$output); + let results = (outs TFL_TensorOf<[F32, QUI8]>:$output); let verifier = [{ return Verify(*this); }]; } @@ -2062,9 +2096,9 @@ def TFL_ReluOp: TFL_Op<"relu", [NoSideEffect, x -> max(0, x) }]; - let arguments = (ins TensorOf<[F32, QUI8, I8]>:$x); + let arguments = (ins TFL_TensorOf<[F32, QUI8, I8]>:$x); - let results = (outs TensorOf<[F32, QUI8, I8]>:$y); + let results = (outs TFL_TensorOf<[F32, QUI8, I8]>:$y); } def TFL_Relu6Op: TFL_Op<"relu6", 
[NoSideEffect, @@ -2077,9 +2111,9 @@ def TFL_Relu6Op: TFL_Op<"relu6", [NoSideEffect, x -> max(0, min(6, x)) }]; - let arguments = (ins TensorOf<[F32, QUI8, I8]>:$x); + let arguments = (ins TFL_TensorOf<[F32, QUI8, I8]>:$x); - let results = (outs TensorOf<[F32, QUI8, I8]>:$y); + let results = (outs TFL_TensorOf<[F32, QUI8, I8]>:$y); } def TFL_Relu1Op: TFL_Op<"relu_n1_to_1", [NoSideEffect, @@ -2092,9 +2126,9 @@ def TFL_Relu1Op: TFL_Op<"relu_n1_to_1", [NoSideEffect, x -> max(-1, min(1, x)) }]; - let arguments = (ins TensorOf<[F32, QUI8, I8]>:$x); + let arguments = (ins TFL_TensorOf<[F32, QUI8, I8]>:$x); - let results = (outs TensorOf<[F32, QUI8, I8]>:$y); + let results = (outs TFL_TensorOf<[F32, QUI8, I8]>:$y); } def TFL_ReshapeOp: TFL_Op<"reshape", [ @@ -2108,7 +2142,7 @@ def TFL_ReshapeOp: TFL_Op<"reshape", [ let arguments = ( ins AnyTensor:$input, - I32Tensor:$shape); + TFL_I32Tensor:$shape); let results = (outs AnyTensor:$output); let hasCanonicalizer = 0b1; @@ -2132,7 +2166,7 @@ slice `i`, with the first `seq_lengths[i]` slices along dimension }]; let arguments = (ins - TensorOf<[F32, I16, I32, I64, TFL_Uint8]>:$input, + TFL_TensorOf<[F32, I16, I32, I64, TFL_Uint8]>:$input, TFL_I32OrI64Tensor:$seq_lengths, I32Attr:$seq_dim, @@ -2140,7 +2174,7 @@ slice `i`, with the first `seq_lengths[i]` slices along dimension ); let results = (outs - TensorOf<[F32, I16, I32, I64, TFL_Uint8]>:$output + TFL_TensorOf<[F32, I16, I32, I64, TFL_Uint8]>:$output ); let hasOptions = 1; @@ -2224,12 +2258,12 @@ def TFL_ReverseV2Op: TFL_Op<"reverse_v2", let arguments = ( ins - TensorOf<[F32, I16, I32, I64, TFL_Uint8, I1]>:$input, - TensorOf<[I32, I64]>:$axis + TFL_TensorOf<[F32, I16, I32, I64, TFL_Uint8, I1]>:$input, + TFL_TensorOf<[I32, I64]>:$axis ); let results = (outs - TensorOf<[F32, I16, I32, I64, TFL_Uint8, I1]>:$output + TFL_TensorOf<[F32, I16, I32, I64, TFL_Uint8, I1]>:$output ); } @@ -2251,8 +2285,8 @@ def TFL_SelectOp : TFL_Op<"select", [NoSideEffect, let arguments = (ins TFL_BoolTensor:$condition, - TensorOf<[F32, I1, I8, I16, I32, I64, TFL_Uint8]>:$x, - TensorOf<[F32, I1, I8, I16, I32, I64, TFL_Uint8]>:$y); + TFL_TensorOf<[F32, I1, I8, I16, I32, I64, TFL_Uint8]>:$x, + TFL_TensorOf<[F32, I1, I8, I16, I32, I64, TFL_Uint8]>:$y); let results = (outs AnyTensor:$output); // TODO(jpienaar): autogenerate this. 
@@ -2280,8 +2314,8 @@ def TFL_SelectV2Op : TFL_Op<"select_v2", [NoSideEffect]> { let arguments = (ins TFL_BoolTensor:$condition, - TensorOf<[F32, I1, I8, I16, I32, I64, TFL_Uint8]>:$x, - TensorOf<[F32, I1, I8, I16, I32, I64, TFL_Uint8]>:$y); + TFL_TensorOf<[F32, I1, I8, I16, I32, I64, TFL_Uint8]>:$x, + TFL_TensorOf<[F32, I1, I8, I16, I32, I64, TFL_Uint8]>:$y); let results = (outs AnyTensor:$output); let builders = [OpBuilder<"Builder *builder, OperationState &result, " @@ -2428,9 +2462,9 @@ def TFL_TanhOp: TFL_Op<"tanh", [ Computes element-wise Hyperbolic tangent of input }]; - let arguments = (ins TensorOf<[F32, I16, I8, QI8, QUI8, QI16, QUI16, TFL_Uint8]>:$x); + let arguments = (ins TFL_TensorOf<[F32, I16, I8, QI8, QUI8, QI16, QUI16, TFL_Uint8]>:$x); - let results = (outs TensorOf<[F32, I16, I8, QI8, QUI8, QI16, QUI16, TFL_Uint8]>:$y); + let results = (outs TFL_TensorOf<[F32, I16, I8, QI8, QUI8, QI16, QUI16, TFL_Uint8]>:$y); } def TFL_TileOp: TFL_Op<"tile", [NoSideEffect, SameOperandsAndResultsScale, @@ -2448,11 +2482,11 @@ def TFL_TileOp: TFL_Op<"tile", [NoSideEffect, SameOperandsAndResultsScale, }]; let arguments = (ins - TensorOf<[F32, I1, I32, I64, TFL_Uint8, QUI8]>:$input, + TFL_TensorOf<[F32, I1, I32, I64, TFL_Uint8, QUI8]>:$input, TFL_I32OrI64Tensor:$multiples); let results = (outs - TensorOf<[F32, I1, I32, I64, TFL_Uint8, QUI8]>:$output); + TFL_TensorOf<[F32, I1, I32, I64, TFL_Uint8, QUI8]>:$output); let hasOptions = 0; } @@ -2472,12 +2506,12 @@ def TFL_TopKV2Op: TFL_Op<"topk_v2", [NoSideEffect, TFL_OperandHasRank<1,0>, }]; let arguments = (ins - TensorOf<[F32, I8, I32, I64, TFL_Uint8, QI8, QUI8]>:$input, - I32Tensor:$k); + TFL_TensorOf<[F32, I8, I32, I64, TFL_Uint8, QI8, QUI8]>:$input, + TFL_I32Tensor:$k); let results = (outs - TensorOf<[F32, I8, I32, I64, TFL_Uint8, QI8, QUI8]>:$values, - I32Tensor:$indices); + TFL_TensorOf<[F32, I8, I32, I64, TFL_Uint8, QI8, QUI8]>:$values, + TFL_I32Tensor:$indices); let builders = [OpBuilder<"Builder *builder, OperationState &result, " "Value input, Value k", @@ -2503,7 +2537,7 @@ def TFL_TransposeOp : TFL_Op<"transpose", let arguments = ( ins AnyTensor:$x, - TensorOf<[I32]>:$perm + TFL_TensorOf<[I32]>:$perm ); let results = (outs @@ -2536,14 +2570,14 @@ def TFL_UnpackOp : TFL_Op<"unpack", [NoSideEffect, SameOperandsAndResultsScale]> }]; let arguments = (ins - TensorOf<[F32, I1, I8, I32, QI8, QUI8]>:$input, + TFL_TensorOf<[F32, I1, I8, I32, QI8, QUI8]>:$input, I32Attr:$num, I32Attr:$axis ); let results = (outs - Variadic>:$outputs + TFL_VariadicTensorOf<[F32, I1, I8, I32, QI8, QUI8]>:$outputs ); let verifier = [{ return Verify(*this); }]; @@ -2578,13 +2612,13 @@ def TFL_BatchToSpaceNdOp: TFL_Op<"batch_to_space_nd", [ }]; let arguments = (ins - TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$input, - TensorOf<[I32]>:$block_shape, - TensorOf<[I32]>:$indices + TFL_TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$input, + TFL_TensorOf<[I32]>:$block_shape, + TFL_TensorOf<[I32]>:$indices ); let results = (outs - TensorOf<[F32, I16, I32, I64, QI8, QUI8]>:$output + TFL_TensorOf<[F32, I16, I32, I64, QI8, QUI8]>:$output ); } @@ -2601,13 +2635,13 @@ def TFL_SpaceToBatchNdOp: TFL_Op<"space_to_batch_nd", [ }]; let arguments = (ins - TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$input, - TensorOf<[I32]>:$block_shape, - TensorOf<[I32]>:$paddings + TFL_TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$input, + TFL_TensorOf<[I32]>:$block_shape, + TFL_TensorOf<[I32]>:$paddings ); let results = (outs - TensorOf<[F32, I16, I32, I64, QI8, QUI8]>:$output + TFL_TensorOf<[F32, I16, I32, 
I64, QI8, QUI8]>:$output ); } @@ -2627,12 +2661,12 @@ def TFL_SpaceToDepthOp: TFL_Op<"space_to_depth", [ }]; let arguments = (ins - TensorOf<[F32, I8, I32, I64, TFL_Uint8, QUI8]>:$input, + TFL_TensorOf<[F32, I8, I32, I64, TFL_Uint8, QUI8]>:$input, I32Attr:$block_size ); let results = (outs - TensorOf<[F32, I8, I32, I64, TFL_Uint8, QUI8]>:$output + TFL_TensorOf<[F32, I8, I32, I64, TFL_Uint8, QUI8]>:$output ); let hasOptions = 1; @@ -2656,12 +2690,12 @@ def TFL_DepthToSpaceOp: TFL_Op<"depth_to_space", [ }]; let arguments = (ins - TensorOf<[F32, I8, I32, I64, TFL_Uint8, TFL_Quint8, QUI8]>:$input, + TFL_TensorOf<[F32, I8, I32, I64, TFL_Uint8, TFL_Quint8, QUI8]>:$input, I32Attr:$block_size ); let results = (outs - TensorOf<[F32, I8, I32, I64, TFL_Uint8, TFL_Quint8, QUI8]>:$output + TFL_TensorOf<[F32, I8, I32, I64, TFL_Uint8, TFL_Quint8, QUI8]>:$output ); let hasOptions = 1; @@ -2680,13 +2714,13 @@ def TFL_SplitOp : TFL_Op<"split", [ }]; let arguments = (ins - TensorOf<[I32]>:$split_dim, - TensorOf<[F32, I16, I32, I64, QI8, QUI8, QI16]>:$value, + TFL_TensorOf<[I32]>:$split_dim, + TFL_TensorOf<[F32, I16, I32, I64, QI8, QUI8, QI16]>:$value, PositiveI32Attr:$num_splits ); let results = (outs - Variadic>:$outputs + TFL_VariadicTensorOf<[F32, I16, I32, I64, QI8, QUI8, QI16]>:$outputs ); let verifier = [{ return Verify(*this); }]; @@ -2704,14 +2738,14 @@ def TFL_SplitVOp : TFL_Op<"split_v", [NoSideEffect, SameOperandsAndResultsScale] }]; let arguments = (ins - TensorOf<[F32, I16, I32, I64, QI8, QUI8, QI16]>:$value, - 1DTensorOf<[I32]>:$size_splits, - 0DTensorOf<[I32]>:$split_dim, + TFL_TensorOf<[F32, I16, I32, I64, QI8, QUI8, QI16]>:$value, + TFL_1DTensorOf<[I32], [I32]>:$size_splits, + TFL_0DTensorOf<[I32], [I32]>:$split_dim, PositiveI32Attr:$num_splits ); let results = (outs - Variadic>:$outputs + TFL_VariadicTensorOf<[F32, I16, I32, I64, QI8, QUI8, QI16]>:$outputs ); let verifier = [{ return Verify(*this); }]; @@ -2729,14 +2763,14 @@ def TFL_ResizeBilinearOp: TFL_Op<"resize_bilinear", [ let arguments = (ins // TODO(ycling): Support quantized types. - TensorOf<[F32, I32, QI8, QUI8]>:$input, - TensorOf<[I32]>:$size, + TFL_TensorOf<[F32, I32, QI8, QUI8]>:$input, + TFL_TensorOf<[I32]>:$size, BoolAttr:$align_corners, DefaultValuedAttr:$half_pixel_centers ); let results = (outs - TensorOf<[F32, QI8, QUI8]>:$output + TFL_TensorOf<[F32, QI8, QUI8]>:$output ); let hasOptions = 1; @@ -2752,13 +2786,13 @@ def TFL_ResizeNearestNeighborOp : TFL_Op<"resize_nearest_neighbor", }]; let arguments = (ins - TensorOf<[F32, I8, TFL_Uint8, QUI8, QI8]>:$input, - TensorOf<[I32]>:$size, + TFL_TensorOf<[F32, I8, TFL_Uint8, QUI8, QI8]>:$input, + TFL_TensorOf<[I32]>:$size, BoolAttr:$align_corners ); let results = (outs - TensorOf<[F32, I8, TFL_Uint8, QUI8, QI8]>:$output + TFL_TensorOf<[F32, I8, TFL_Uint8, QUI8, QI8]>:$output ); let hasOptions = 1; @@ -2792,12 +2826,12 @@ are checked during execution. 
let arguments = (ins TFL_I32OrI64Tensor:$sparse_indices, TFL_I32OrI64Tensor:$output_shape, - TensorOf<[I32, I64, I8, TFL_Uint8, F32]>:$sparse_values, - TensorOf<[I32, I64, I8, TFL_Uint8, F32]>:$default_value + TFL_TensorOf<[I32, I64, I8, TFL_Uint8, F32]>:$sparse_values, + TFL_TensorOf<[I32, I64, I8, TFL_Uint8, F32]>:$default_value ); let results = (outs - TensorOf<[I32, I64, I8, TFL_Uint8, F32]>:$dense + TFL_TensorOf<[I32, I64, I8, TFL_Uint8, F32]>:$dense ); } @@ -2815,10 +2849,10 @@ def TFL_StridedSliceOp: TFL_Op<"strided_slice", }]; let arguments = (ins - TensorOf<[F32, I32, I64, I8, QI8, QUI8, I1, TFL_Quint8, TFL_Uint8]>:$input, - TensorOf<[I32]>:$begin, - TensorOf<[I32]>:$end, - TensorOf<[I32]>:$strides, + TFL_TensorOf<[F32, I32, I64, I8, QI8, QUI8, I1, TFL_Quint8, TFL_Uint8]>:$input, + TFL_TensorOf<[I32]>:$begin, + TFL_TensorOf<[I32]>:$end, + TFL_TensorOf<[I32]>:$strides, I32Attr:$begin_mask, I32Attr:$end_mask, @@ -2828,7 +2862,7 @@ def TFL_StridedSliceOp: TFL_Op<"strided_slice", ); let results = (outs - TensorOf<[F32, I32, I64, I8, QI8, QUI8, I1, TFL_Quint8, TFL_Uint8]>:$output + TFL_TensorOf<[F32, I32, I64, I8, QI8, QUI8, I1, TFL_Quint8, TFL_Uint8]>:$output ); let hasOptions = 1; @@ -2843,10 +2877,10 @@ def TFL_CastOp : TFL_Op<"cast", [ }]; let arguments = (ins - TensorOf<[F32, I1, I32, I64, TFL_Quint8, TFL_Uint8, Complex>]>:$input + TFL_TensorOf<[F32, I1, I32, I64, TFL_Quint8, TFL_Uint8, Complex>]>:$input ); - let results = (outs TensorOf<[F32, I1, I32, I64, Complex>]>:$output); + let results = (outs TFL_TensorOf<[F32, I1, I32, I64, Complex>]>:$output); // TFLite's cast op does not utilize CastOptions, instead derives types // from the TfLiteTensors. @@ -2878,13 +2912,13 @@ def TFL_MirrorPadOp: TFL_Op<"mirror_pad", [ let arguments = (ins // TODO: add uint8 support when ready. - TensorOf<[F32, I32, I64]>:$input, - TensorOf<[I32, I64]>:$pad, + TFL_TensorOf<[F32, I32, I64]>:$input, + TFL_TensorOf<[I32, I64]>:$pad, TFL_MirrorPaddingAttr:$mode ); let results = (outs - TensorOf<[F32, I32, I64]>:$output + TFL_TensorOf<[F32, I32, I64]>:$output ); let hasOptions = 1; @@ -2902,12 +2936,12 @@ in the unique output `y`. In other words: let arguments = (ins // TODO: add uint8 support after quantize support. 
- TensorOf<[I8, I16, I32, I64, F32]>:$input + TFL_TensorOf<[I8, I16, I32, I64, F32]>:$input ); let results = (outs - TensorOf<[I8, I16, I32, I64, F32]>:$output, - TensorOf<[I32, I64]>:$idx + TFL_TensorOf<[I8, I16, I32, I64, F32]>:$output, + TFL_TensorOf<[I32, I64]>:$idx ); DerivedTFLiteTypeAttr idx_out_type = DerivedTFLiteTypeAttr<[{ @@ -3107,11 +3141,11 @@ def TFL_BasicLSTMOp : TFL_Op<"basic_lstm", [NoSideEffect, }]; let arguments = ( - ins TensorOf<[F32, I8, QI8, QUI8, QI16, QUI16]>:$data_input, - TensorOf<[F32, I8, QI8, QUI8, QI16, QUI16]>:$prev_activ_input, - TensorOf<[F32, I8, QI8, QUI8, QI16, QUI16]>:$weights_input, - TensorOf<[F32, QI32, QUI32]>:$biases_input, - TensorOf<[F32, I8, QI8, QUI8, QI16, QUI16]>:$prev_state_input, + ins TFL_TensorOf<[F32, I8, QI8, QUI8, QI16, QUI16]>:$data_input, + TFL_TensorOf<[F32, I8, QI8, QUI8, QI16, QUI16]>:$prev_activ_input, + TFL_TensorOf<[F32, I8, QI8, QUI8, QI16, QUI16]>:$weights_input, + TFL_TensorOf<[F32, QI32, QUI32]>:$biases_input, + TFL_TensorOf<[F32, I8, QI8, QUI8, QI16, QUI16]>:$prev_state_input, // Attributes DefaultValuedAttr:$fused_activation_function, @@ -3125,10 +3159,10 @@ def TFL_BasicLSTMOp : TFL_Op<"basic_lstm", [NoSideEffect, let hasOptions = 1; - let results = (outs 2DTensorOf<[F32, I8, QI8, QUI8, QI16, QUI16]>:$activ_output, - 2DTensorOf<[F32, I8, QI8, QUI8, QI16, QUI16]>:$state_output, - 2DTensorOf<[F32, I8, QI8, QUI8, QI16, QUI16]>:$concat_temp, - 2DTensorOf<[F32, I8, QI8, QUI8, QI16, QUI16]>:$activ_temp); + let results = (outs TFL_2DTensorOf<[F32, I8, QI8, QUI8, QI16, QUI16]>:$activ_output, + TFL_2DTensorOf<[F32, I8, QI8, QUI8, QI16, QUI16]>:$state_output, + TFL_2DTensorOf<[F32, I8, QI8, QUI8, QI16, QUI16]>:$concat_temp, + TFL_2DTensorOf<[F32, I8, QI8, QUI8, QI16, QUI16]>:$activ_temp); } // This is the FULL kernel type LSTM op. @@ -3161,19 +3195,19 @@ Ba et al. “Layer Normalization” }]; let arguments = ( - ins TensorOf<[F32]>:$input, + ins TFL_TensorOf<[F32]>:$input, // Weights TFL_TensorOfOrNone<[F32, I8]>:$input_to_input_weights, - TensorOf<[F32, I8]>:$input_to_forget_weights, - TensorOf<[F32, I8]>:$input_to_cell_weights, - TensorOf<[F32, I8]>:$input_to_output_weights, + TFL_TensorOf<[F32, I8]>:$input_to_forget_weights, + TFL_TensorOf<[F32, I8]>:$input_to_cell_weights, + TFL_TensorOf<[F32, I8]>:$input_to_output_weights, // Recurrent weights TFL_TensorOfOrNone<[F32, I8]>:$recurrent_to_input_weights, - TensorOf<[F32, I8]>:$recurrent_to_forget_weights, - TensorOf<[F32, I8]>:$recurrent_to_cell_weights, - TensorOf<[F32, I8]>:$recurrent_to_output_weights, + TFL_TensorOf<[F32, I8]>:$recurrent_to_forget_weights, + TFL_TensorOf<[F32, I8]>:$recurrent_to_cell_weights, + TFL_TensorOf<[F32, I8]>:$recurrent_to_output_weights, // Cell weights TFL_TensorOfOrNone<[F32, I8]>:$cell_to_input_weights, @@ -3184,9 +3218,9 @@ Ba et al. 
“Layer Normalization” // Bias TFL_TensorOfOrNone<[F32]>:$input_gate_bias, - TensorOf<[F32]>:$forget_gate_bias, - TensorOf<[F32]>:$cell_bias, - TensorOf<[F32]>:$output_gate_bias, + TFL_TensorOf<[F32]>:$forget_gate_bias, + TFL_TensorOf<[F32]>:$cell_bias, + TFL_TensorOf<[F32]>:$output_gate_bias, // Projection weight and bias TFL_TensorOfOrNone<[F32, I8]>:$projection_weights, @@ -3253,19 +3287,19 @@ def TFL_UnidirectionalSequenceLSTMOp : }]; let arguments = ( - ins TensorOf<[F32, I8]>:$input, + ins TFL_TensorOf<[F32, I8]>:$input, // Weights TFL_TensorOfOrNone<[F32, I8]>:$input_to_input_weights, - TensorOf<[F32, I8]>:$input_to_forget_weights, - TensorOf<[F32, I8]>:$input_to_cell_weights, - TensorOf<[F32, I8]>:$input_to_output_weights, + TFL_TensorOf<[F32, I8]>:$input_to_forget_weights, + TFL_TensorOf<[F32, I8]>:$input_to_cell_weights, + TFL_TensorOf<[F32, I8]>:$input_to_output_weights, // Recurrent weights TFL_TensorOfOrNone<[F32, I8]>:$recurrent_to_input_weights, - TensorOf<[F32, I8]>:$recurrent_to_forget_weights, - TensorOf<[F32, I8]>:$recurrent_to_cell_weights, - TensorOf<[F32, I8]>:$recurrent_to_output_weights, + TFL_TensorOf<[F32, I8]>:$recurrent_to_forget_weights, + TFL_TensorOf<[F32, I8]>:$recurrent_to_cell_weights, + TFL_TensorOf<[F32, I8]>:$recurrent_to_output_weights, // Cell weights TFL_TensorOfOrNone<[F32, I8]>:$cell_to_input_weights, @@ -3276,9 +3310,9 @@ def TFL_UnidirectionalSequenceLSTMOp : // Bias TFL_TensorOfOrNone<[F32]>:$input_gate_bias, - TensorOf<[F32]>:$forget_gate_bias, - TensorOf<[F32]>:$cell_bias, - TensorOf<[F32]>:$output_gate_bias, + TFL_TensorOf<[F32]>:$forget_gate_bias, + TFL_TensorOf<[F32]>:$cell_bias, + TFL_TensorOf<[F32]>:$output_gate_bias, // Projection weight and bias TFL_TensorOfOrNone<[F32, I8]>:$projection_weights, @@ -3339,16 +3373,16 @@ def TFL_UnidirectionalSequenceRNNOp : }]; let arguments = ( - ins TensorOf<[F32, I8]>:$input, + ins TFL_TensorOf<[F32, I8]>:$input, // Weights - TensorOf<[F32, I8]>:$input_to_input_weights, + TFL_TensorOf<[F32, I8]>:$input_to_input_weights, // Recurrent weights - TensorOf<[F32, I8]>:$recurrent_to_input_weights, + TFL_TensorOf<[F32, I8]>:$recurrent_to_input_weights, // Bias - TensorOf<[F32]>:$input_gate_bias, + TFL_TensorOf<[F32]>:$input_gate_bias, // Hidden state. TFL_StatefulTensor:$hidden_state, @@ -3358,7 +3392,7 @@ def TFL_UnidirectionalSequenceRNNOp : TFL_AFAttr:$fused_activation_function ); - let results = (outs TensorOf<[F32, I8]>:$output); + let results = (outs TFL_TensorOf<[F32, I8]>:$output); let hasOptions = 1; @@ -3385,11 +3419,11 @@ the output tensor can vary depending on how many true values there are in }]; let arguments = (ins - I1Tensor:$input + TFL_BoolTensor:$input ); let results = (outs - I64Tensor:$index + TFL_I64Tensor:$index ); } @@ -3404,8 +3438,8 @@ def TFL_NumericVerifyOp : Op:$input, - TensorOf<[F32]>:$ref, + TFL_TensorOf<[QI8, QUI8, QI16, QUI16]>:$input, + TFL_TensorOf<[F32]>:$ref, // Attributes DefaultValuedAttr:$tolerance @@ -3433,13 +3467,13 @@ def TFL_SVDFOp : }]; let arguments = ( - ins TensorOf<[F32, I8]>:$input, + ins TFL_TensorOf<[F32, I8]>:$input, // Feature Weights. 
- TensorOf<[F32, I8]>:$feature_weights, + TFL_TensorOf<[F32, I8]>:$feature_weights, // Time weights - TensorOf<[F32, I8]>:$time_weights, + TFL_TensorOf<[F32, I8]>:$time_weights, // Bias TFL_TensorOfOrNone<[F32]>:$input_gate_bias, @@ -3452,7 +3486,7 @@ def TFL_SVDFOp : TFL_AFAttr:$fused_activation_function ); - let results = (outs TensorOf<[F32, I8]>:$output); + let results = (outs TFL_TensorOf<[F32, I8]>:$output); let hasOptions = 1; @@ -3472,10 +3506,10 @@ def TFL_SegmentSumOp: TFL_Op<"segment_sum", [NoSideEffect]> { }]; let arguments = (ins - TensorOf<[F32, I32]>:$data, - I32Tensor:$segment_ids + TFL_TensorOf<[F32, I32]>:$data, + TFL_I32Tensor:$segment_ids ); - let results = (outs TensorOf<[F32, I32]>:$output); + let results = (outs TFL_TensorOf<[F32, I32]>:$output); } def TFL_YieldOp : Op { diff --git a/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc index e7a6cf7f47d..f2b89aebb44 100644 --- a/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc @@ -282,6 +282,7 @@ Status ConvertGraphDefToTFLiteFlatBuffer(const toco::ModelFlags& model_flags, if (pass_config.legalize_tf_while) { pm.addPass(mlir::TFL::CreateWhileOutlinePass()); } + pm.addPass(mlir::TFL::CreateRuntimeTypeVerifyPass()); auto status = ConvertTFExecutorToTFLOrFlatbuffer( module.get(), /*export_to_mlir=*/false, emit_builtin_tflite_ops, diff --git a/tensorflow/compiler/mlir/lite/tests/ops.mlir b/tensorflow/compiler/mlir/lite/tests/ops.mlir index 6c9836005fc..a1369fe969a 100644 --- a/tensorflow/compiler/mlir/lite/tests/ops.mlir +++ b/tensorflow/compiler/mlir/lite/tests/ops.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt -split-input-file -verify-diagnostics %s | FileCheck %s --dump-input-on-failure +// RUN: tf-opt -split-input-file -verify-diagnostics -tfl-runtime-verify %s | FileCheck %s --dump-input-on-failure // Unary math ops // ----- diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc index 648f469e9b0..914156deaae 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc @@ -24,6 +24,7 @@ limitations under the License. #include "mlir/IR/Function.h" // TF:llvm-project #include "mlir/IR/MLIRContext.h" // TF:llvm-project #include "mlir/IR/Module.h" // TF:llvm-project +#include "mlir/Pass/Pass.h" // TF:llvm-project #include "mlir/Support/FileUtilities.h" // TF:llvm-project #include "tensorflow/compiler/mlir/init_mlir.h" #include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" @@ -32,6 +33,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/tf_tfl_passes.h" #include "tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.h" #include "tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h" +#include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/lite/model.h" @@ -182,6 +184,7 @@ int main(int argc, char **argv) { pass_config.inline_functions = inline_functions; tensorflow::AddTFToTFLConversionPasses(pass_config, &pm); + pm.addPass(mlir::TFL::CreateRuntimeTypeVerifyPass()); std::string result; auto status = tensorflow::ConvertTFExecutorToTFLOrFlatbuffer( diff --git a/tensorflow/compiler/mlir/lite/transforms/passes.h b/tensorflow/compiler/mlir/lite/transforms/passes.h index 559bdc6d8e6..b713b474b3d 100644 --- a/tensorflow/compiler/mlir/lite/transforms/passes.h +++ b/tensorflow/compiler/mlir/lite/transforms/passes.h @@ -91,6 +91,9 @@ std::unique_ptr> CreateLegalizeTFWhilePass(); // Creates an instance of the TensorFlow Lite dialect WhileOp outline pass. std::unique_ptr> CreateWhileOutlinePass(); +// Verifies runtime supports types used. +std::unique_ptr> CreateRuntimeTypeVerifyPass(); + } // namespace TFL } // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/transforms/runtime_type_verify.cc b/tensorflow/compiler/mlir/lite/transforms/runtime_type_verify.cc new file mode 100644 index 00000000000..2a35701f0e6 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/transforms/runtime_type_verify.cc @@ -0,0 +1,52 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/IR/OperationSupport.h" // TF:llvm-project +#include "mlir/Pass/Pass.h" // TF:llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" + +namespace mlir { +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops_interface.h.inc" +namespace TFL { +namespace { + +// This pass verifies that the operands and results types are supported by +// TFLite runtime. +class RuntimeTypeVerifyPass : public mlir::FunctionPass { + public: + explicit RuntimeTypeVerifyPass() {} + + private: + void runOnFunction() override; +}; + +void RuntimeTypeVerifyPass::runOnFunction() { + getFunction().walk([&](TflRuntimeVerifyOpInterface op) { + if (failed(op.VerifyTflRuntimeTypes(op.getOperation()))) + signalPassFailure(); + }); +} +} // namespace + +// Verifies runtime supports types used. +std::unique_ptr> CreateRuntimeTypeVerifyPass() { + return std::make_unique(); +} + +static PassRegistration pass( + "tfl-runtime-verify", "TFLite runtime verification"); + +} // namespace TFL +} // namespace mlir From ecd4c8a5a74d34b101b828a0947fa99611f0ddf4 Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Tue, 18 Feb 2020 17:08:17 -0800 Subject: [PATCH 194/442] Add a constant fold legalize transform to RandomUniform for TFLite converter. This is similar to the pass in toco. 
This is for backward compatibility with toco and should be removed in later change. PiperOrigin-RevId: 295857994 Change-Id: I577ef30036b092fe8391ab93ee37259eb5807fe4 --- .../compiler/mlir/lite/tests/legalize-tf.mlir | 34 +++++++++++ .../mlir/lite/transforms/legalize_tf.cc | 58 +++++++++++++++++-- 2 files changed, 87 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir index 570e909e256..662e9fd642e 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir @@ -1373,3 +1373,37 @@ func @reciprocal_i64(%arg0: tensor<8xi64>) -> tensor<8xi64> { // CHECK: "tfl.div"(%cst, %arg0) {fused_activation_function = "NONE"} : (tensor<1xi64>, tensor<8xi64>) -> tensor<8xi64> // CHECK: return } + +func @random_uniform() -> tensor<2x5xf32> { + %0 = "tf.Const"() { value = dense<[2, 5]> : tensor<2xi32> } : () -> tensor<2xi32> + %1 = "tf.RandomUniform"(%0) { seed = 1, seed2 = 0} : (tensor<2xi32>) -> tensor<2x5xf32> + return %1 : tensor<2x5xf32> + + // CHECK-LABEL: random_uniform + // CHECK: %[[CST:.*]] = constant dense + // CHECK: return %[[CST:.*]] : tensor<2x5xf32> +} + +func @random_uniform_no_fold(%arg0: tensor<2xi32>) -> tensor<2x5xf32> { + %1 = "tf.RandomUniform"(%arg0) { seed = 0, seed2 = 0} : (tensor<2xi32>) -> tensor<2x5xf32> + return %1 : tensor<2x5xf32> + + // CHECK-LABEL: random_uniform_no_fold + // CHECK: %[[RANDOM:.*]] = "tf.RandomUniform" +} + +func @random_uniform_no_fold2(%arg0: tensor<2xi32>) -> tensor<*xf32> { + %1 = "tf.RandomUniform"(%arg0) { seed = 1, seed2 = 2} : (tensor<2xi32>) -> tensor<*xf32> + return %1 : tensor<*xf32> + + // CHECK-LABEL: random_uniform_no_fold2 + // CHECK: %[[RANDOM:.*]] = "tf.RandomUniform" +} + +func @random_uniform_no_fold3(%arg0: tensor<2xi32>) -> tensor<*xf64> { + %1 = "tf.RandomUniform"(%arg0) { seed = 1, seed2 = 2} : (tensor<2xi32>) -> tensor<*xf64> + return %1 : tensor<*xf64> + + // CHECK-LABEL: random_uniform_no_fold3 + // CHECK: %[[RANDOM:.*]] = "tf.RandomUniform" +} diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc index 062895e9b9f..99e7e99f66a 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc @@ -49,6 +49,8 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/random/philox_random.h" +#include "tensorflow/core/lib/random/random_distributions.h" #include "tensorflow/core/protobuf/error_codes.pb.h" namespace mlir { @@ -114,9 +116,54 @@ DECL_CONVERT_OP(SplitV); DECL_CONVERT_OP(StridedSlice); DECL_CONVERT_OP(Unpack); DECL_CONVERT_OP(Reciprocal); +DECL_CONVERT_OP(RandomUniform); #undef DECL_CONVERT_OP +PatternMatchResult ConvertTFRandomUniformOp::matchAndRewrite( + Operation* op, PatternRewriter& rewriter) const { + auto random_uniform_op = cast(op); + if (random_uniform_op.seed() == 0 && random_uniform_op.seed2() == 0) { + return matchFailure(); + } + if (!random_uniform_op.dtype().isF32()) { + return matchFailure(); + } + typedef tensorflow::random::UniformDistribution< + tensorflow::random::PhiloxRandom, float> + Distribution; + + tensorflow::random::PhiloxRandom generator( + random_uniform_op.seed().getSExtValue(), + random_uniform_op.seed2().getSExtValue()); + Distribution dist; + int num_elements = 0; + if (auto output_type = + random_uniform_op.output().getType().dyn_cast_or_null()) { + if (auto ranked_output = output_type.dyn_cast_or_null()) { + if (!ranked_output.hasRank() || ranked_output.getNumDynamicDims() != 0) { + return matchFailure(); + } + num_elements = output_type.getNumElements(); + size_t offset = 0; + size_t num_samples = Distribution::kResultElementCount; + llvm::SmallVector data; + data.resize(num_elements); + while (offset < num_elements) { + const typename Distribution::ResultType samples = dist(&generator); + std::copy(&samples[0], + &samples[0] + std::min(num_samples, data.size() - offset), + &data[0] + offset); + offset += num_samples; + } + auto output_data = DenseFPElementsAttr::get(output_type, data); + rewriter.replaceOpWithNewOp(op, output_type, output_data); + return matchSuccess(); + } + } + return matchFailure(); +} + PatternMatchResult ConvertTFConcatOp::matchAndRewrite( Operation* op, PatternRewriter& rewriter) const { auto tf_concat_op = cast(op); @@ -521,11 +568,12 @@ void LegalizeTF::runOnFunction() { // Add the generated patterns to the list. populateWithGenerated(ctx, &patterns); - patterns.insert(ctx); + patterns + .insert(ctx); applyPatternsGreedily(func, patterns); } From 6a1c2d5f068f7c6b3edd1314754dcc538952075f Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 18 Feb 2020 17:08:53 -0800 Subject: [PATCH 195/442] Implement the new programmatic profiling API. 
PiperOrigin-RevId: 295858100 Change-Id: Id5535fda2882ac44c5ab7a59bbdda7093d4d5a5c --- tensorflow/python/eager/profiler.py | 4 +- tensorflow/python/profiler/BUILD | 24 ++++ tensorflow/python/profiler/internal/BUILD | 3 + .../profiler/internal/profiler_wrapper.cc | 48 ++++++- tensorflow/python/profiler/profiler_v2.py | 135 ++++++++++++++++++ .../python/profiler/profiler_v2_test.py | 93 ++++++++++++ 6 files changed, 302 insertions(+), 5 deletions(-) create mode 100644 tensorflow/python/profiler/profiler_v2.py create mode 100644 tensorflow/python/profiler/profiler_v2_test.py diff --git a/tensorflow/python/eager/profiler.py b/tensorflow/python/eager/profiler.py index 13e4a71427d..835a0d72bbf 100644 --- a/tensorflow/python/eager/profiler.py +++ b/tensorflow/python/eager/profiler.py @@ -76,7 +76,7 @@ def start(): context.ensure_initialized() _profiler = _pywrap_profiler.ProfilerSession() try: - _profiler.start() + _profiler.start('') except errors.AlreadyExistsError: logging.warning('Another profiler session is running which is probably ' 'created by profiler server. Please avoid using profiler ' @@ -157,7 +157,7 @@ def start_profiler_server(port): """ if context.default_execution_mode == context.EAGER_MODE: context.ensure_initialized() - _pywrap_profiler.start_profiler_server(port) + _pywrap_profiler.start_server(port) class Profiler(object): diff --git a/tensorflow/python/profiler/BUILD b/tensorflow/python/profiler/BUILD index 882f41fd8d8..6c2abbd1f4b 100644 --- a/tensorflow/python/profiler/BUILD +++ b/tensorflow/python/profiler/BUILD @@ -41,6 +41,30 @@ cuda_py_test( ], ) +py_library( + name = "profiler_v2", + srcs = ["profiler_v2.py"], + srcs_version = "PY2AND3", + visibility = ["//tensorflow:internal"], + deps = [ + "//tensorflow/python:util", + "//tensorflow/python/profiler/internal:_pywrap_profiler", + ], +) + +cuda_py_test( + name = "profiler_v2_test", + srcs = ["profiler_v2_test.py"], + python_version = "PY3", + tags = ["no_pip"], + deps = [ + ":profiler_v2", + "//tensorflow/python:constant_op", + "//tensorflow/python/eager:test", + "//tensorflow/python/profiler:traceme", + ], +) + py_library( name = "option_builder", srcs = ["option_builder.py"], diff --git a/tensorflow/python/profiler/internal/BUILD b/tensorflow/python/profiler/internal/BUILD index 0b98a5b0c85..05717904df1 100644 --- a/tensorflow/python/profiler/internal/BUILD +++ b/tensorflow/python/profiler/internal/BUILD @@ -118,11 +118,14 @@ tf_python_pybind_extension( ], deps = [ "//tensorflow/core:lib", + "//tensorflow/core/profiler/convert:xplane_to_profile_response", "//tensorflow/core/profiler/lib:profiler_session_headers", "//tensorflow/core/profiler/rpc:profiler_server", "//tensorflow/core/profiler/rpc/client:capture_profile", + "//tensorflow/core/profiler/rpc/client:save_profile", "//tensorflow/python:pybind11_status", "@com_google_absl//absl/memory", + "@com_google_absl//absl/time", "@pybind11", ], ) diff --git a/tensorflow/python/profiler/internal/profiler_wrapper.cc b/tensorflow/python/profiler/internal/profiler_wrapper.cc index 0072a204429..5c11fbb1cff 100644 --- a/tensorflow/python/profiler/internal/profiler_wrapper.cc +++ b/tensorflow/python/profiler/internal/profiler_wrapper.cc @@ -16,10 +16,14 @@ limitations under the License. 
#include #include "absl/memory/memory.h" +#include "absl/time/time.h" #include "include/pybind11/pybind11.h" +#include "tensorflow/core/platform/host_info.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/convert/xplane_to_profile_response.h" #include "tensorflow/core/profiler/lib/profiler_session.h" #include "tensorflow/core/profiler/rpc/client/capture_profile.h" +#include "tensorflow/core/profiler/rpc/client/save_profile.h" #include "tensorflow/core/profiler/rpc/profiler_server.h" #include "tensorflow/python/lib/core/pybind11_status.h" @@ -27,10 +31,24 @@ namespace py = ::pybind11; namespace { +tensorflow::string GetCurrentTimeStampAsString() { + return absl::FormatTime("%E4Y-%m-%d_%H:%M:%S", absl::Now(), + absl::LocalTimeZone()); +} + +tensorflow::ProfileRequest MakeProfileRequest() { + tensorflow::ProfileRequest request; + request.add_tools("overview_page"); + request.add_tools("input_pipeline"); + request.add_tools("tensorflow_stats"); + return request; +} + class ProfilerSessionWrapper { public: - void Start() { + void Start(const char* logdir) { session_ = tensorflow::ProfilerSession::Create(); + logdir_ = logdir; tensorflow::MaybeRaiseRegisteredFromStatus(session_->Status()); } @@ -45,8 +63,31 @@ class ProfilerSessionWrapper { return py::bytes(content); } + void ExportToTensorBoard() { + if (!session_ || logdir_.empty()) return; + tensorflow::profiler::XSpace xspace; + tensorflow::Status status; + status = session_->CollectData(&xspace); + session_.reset(); + if (!status.ok()) { + tensorflow::MaybeRaiseRegisteredFromStatus(status); + return; + } + tensorflow::ProfileResponse response; + tensorflow::profiler::ConvertXSpaceToProfileResponse( + xspace, MakeProfileRequest(), &response); + + std::stringstream ss; // Record LOG messages. + status = tensorflow::profiler::SaveTensorboardProfile( + logdir_, GetCurrentTimeStampAsString(), tensorflow::port::Hostname(), + response, &ss); + LOG(INFO) << ss.str(); + tensorflow::MaybeRaiseRegisteredFromStatus(tensorflow::Status::OK()); + } + private: std::unique_ptr session_; + tensorflow::string logdir_; }; } // namespace @@ -56,9 +97,10 @@ PYBIND11_MODULE(_pywrap_profiler, m) { "ProfilerSession"); profiler_session_class.def(py::init<>()) .def("start", &ProfilerSessionWrapper::Start) - .def("stop", &ProfilerSessionWrapper::Stop); + .def("stop", &ProfilerSessionWrapper::Stop) + .def("export_to_tb", &ProfilerSessionWrapper::ExportToTensorBoard); - m.def("start_profiler_server", [](int port) { + m.def("start_server", [](int port) { auto profiler_server = absl::make_unique(); profiler_server->StartProfilerServer(port); // Intentionally release profiler server. Should transfer ownership to diff --git a/tensorflow/python/profiler/profiler_v2.py b/tensorflow/python/profiler/profiler_v2.py new file mode 100644 index 00000000000..8401ed43031 --- /dev/null +++ b/tensorflow/python/profiler/profiler_v2.py @@ -0,0 +1,135 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""TensorFlow 2.x Profiler. + +The profiler has two modes: +- Programmatic Mode: start(logdir), stop(), and Profiler class. Profiling starts + when calling start(logdir) or create a Profiler class. + Profiling stops when calling stop() to save to + TensorBoard logdir or destroying the Profiler class. +- Sampling Mode: start_server(). It will perform profiling after receiving a + profiling request. + +NOTE: Only one active profiler session is allowed. Use of simultaneous +Programmatic Mode and Sampling Mode is undefined and will likely fail. + +NOTE: The Keras TensorBoard callback will automatically perform sampled +profiling. Before enabling customized profiling, set the callback flag +"profile_batches=[]" to disable automatic sampled profiling. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import threading + +from tensorflow.python.framework import errors +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.profiler.internal import _pywrap_profiler + +_profiler = None +_profiler_lock = threading.Lock() + + +def start(logdir): + """Starts profiling. + + Args: + logdir: A log directory read by TensorBoard to export the profile results. + + Raises: + AlreadyExistsError: If another profiling session is running. + + Example usage: + ```python + tf.profiler.start('logdir_path') + # do your training here. + tf.profiler.stop() + ``` + + Launch TensorBoard and point it to the same logdir you provided to this API. + $ tensorboard --logdir=logdir_path + Open your browser and go to localhost:6006/#profile to view profiling results. + + """ + global _profiler + with _profiler_lock: + if _profiler is not None: + raise errors.AlreadyExistsError(None, None, + 'Another profiler is running.') + _profiler = _pywrap_profiler.ProfilerSession() + try: + _profiler.start(logdir) + except errors.AlreadyExistsError: + logging.warning('Another profiler session is running which is probably ' + 'created by profiler server. Please avoid using profiler ' + 'server and profiler APIs at the same time.') + raise errors.AlreadyExistsError(None, None, + 'Another profiler is running.') + + +def stop(save=True): + """Stops the current profiling session. + + The profiler session will be stopped and profile results will be saved. + + Args: + save: An optional variable to save the results to TensorBoard. Default True. + + Raises: + UnavailableError: If there is no active profiling session. + """ + global _profiler + with _profiler_lock: + if _profiler is None: + raise errors.UnavailableError( + None, None, + 'Cannot export profiling results. No profiler is running.') + if save: + _profiler.export_to_tb() + _profiler = None + + +def start_server(port): + """Start a profiler grpc server that listens to given port. + + The profiler server will exit when the process finishes. The service is + defined in tensorflow/core/profiler/profiler_service.proto. + + Args: + port: port profiler server listens to. + """ + _pywrap_profiler.start_server(port) + + +class Profiler(object): + """Context-manager profiler API. 
+ + Example usage: + ```python + with Profiler("/path/to/logdir"): + # do some work + ``` + """ + + def __init__(self, logdir): + self._logdir = logdir + + def __enter__(self): + start(self._logdir) + + def __exit__(self, typ, value, tb): + stop() diff --git a/tensorflow/python/profiler/profiler_v2_test.py b/tensorflow/python/profiler/profiler_v2_test.py new file mode 100644 index 00000000000..ecea6b89121 --- /dev/null +++ b/tensorflow/python/profiler/profiler_v2_test.py @@ -0,0 +1,93 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for tf 2.x profiler.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import socket + +from tensorflow.core.protobuf import trace_events_pb2 +from tensorflow.python.eager import profiler +from tensorflow.python.eager import test +from tensorflow.python.framework import config +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import errors +from tensorflow.python.framework import test_util +from tensorflow.python.platform import gfile +from tensorflow.python.profiler import profiler_v2 as profiler +from tensorflow.python.profiler import traceme + + +class ProfilerTest(test_util.TensorFlowTestCase): + + def test_profile_exceptions(self): + logdir = self.get_temp_dir() + profiler.start(logdir) + with self.assertRaises(errors.AlreadyExistsError): + profiler.start(logdir) + + profiler.stop() + with self.assertRaises(errors.UnavailableError): + profiler.stop() + + def test_save_profile(self): + logdir = self.get_temp_dir() + profiler.start(logdir) + with traceme.TraceMe('three_times_five'): + three = constant_op.constant(3) + five = constant_op.constant(5) + product = three * five + self.assertAllEqual(15, product) + + profiler.stop() + file_list = gfile.ListDirectory(logdir) + self.assertEqual(len(file_list), 2) + for file_name in gfile.ListDirectory(logdir): + if gfile.IsDirectory(os.path.join(logdir, file_name)): + self.assertEqual(file_name, 'plugins') + else: + self.assertTrue(file_name.endswith('.profile-empty')) + profile_dir = os.path.join(logdir, 'plugins/profile/') + run = gfile.ListDirectory(profile_dir)[0] + hostname = socket.gethostname() + overview_page = os.path.join(profile_dir, run, + hostname + '.overview_page.pb') + self.assertTrue(gfile.Exists(overview_page)) + input_pipeline = os.path.join(profile_dir, run, + hostname + '.input_pipeline.pb') + self.assertTrue(gfile.Exists(input_pipeline)) + tensorflow_stats = os.path.join(profile_dir, run, + hostname + '.tensorflow_stats.pb') + self.assertTrue(gfile.Exists(tensorflow_stats)) + + trace_file = os.path.join(profile_dir, run, hostname + '.trace') + self.assertTrue(gfile.Exists(trace_file)) + with gfile.Open(trace_file, 'rb') as f: + profile_pb = trace_events_pb2.Trace() + profile_pb.ParseFromString(f.read()) + devices = 
frozenset(device.name for device in profile_pb.devices.values()) + self.assertIn('/host:CPU', devices) + if config.list_physical_devices('GPU'): + self.assertIn('/device:GPU:0', devices) + events = frozenset(event.name for event in profile_pb.trace_events) + self.assertIn('three_times_five', events) + self.assertIn('Mul:Mul', events) + + +if __name__ == '__main__': + test.main() From 51615f986822b847f17c3c953bca9261522fe851 Mon Sep 17 00:00:00 2001 From: Gunhan Gulsoy Date: Tue, 18 Feb 2020 17:13:15 -0800 Subject: [PATCH 196/442] Define a new TF toolchain platform that runs on windows 2019 PiperOrigin-RevId: 295858766 Change-Id: I8ce497a66cafde0e50428ed298d7669be39a0997 --- tensorflow/opensource_only.files | 1 + third_party/toolchains/preconfig/win/BUILD | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 third_party/toolchains/preconfig/win/BUILD diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files index c282a6021ee..4d39efad106 100644 --- a/tensorflow/opensource_only.files +++ b/tensorflow/opensource_only.files @@ -268,6 +268,7 @@ tensorflow/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/build_defs.b tensorflow/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5/BUILD tensorflow/third_party/toolchains/preconfig/ubuntu16.04/tensorrt6.0/BUILD tensorflow/third_party/toolchains/preconfig/ubuntu16.04/tensorrt6.0/build_defs.bzl +tensorflow/third_party/toolchains/preconfig/win/BUILD tensorflow/third_party/toolchains/preconfig/win_1803/BUILD tensorflow/third_party/toolchains/preconfig/win_1803/bazel_025/BUILD tensorflow/third_party/toolchains/preconfig/win_1803/bazel_026/BUILD diff --git a/third_party/toolchains/preconfig/win/BUILD b/third_party/toolchains/preconfig/win/BUILD new file mode 100644 index 00000000000..519d8e5110d --- /dev/null +++ b/third_party/toolchains/preconfig/win/BUILD @@ -0,0 +1,21 @@ +licenses(["restricted"]) + +package(default_visibility = ["//visibility:public"]) + +java_runtime( + name = "windows_jdk8", + srcs = [], + java_home = "C:/openjdk", +) + +platform( + name = "rbe_windows_ltsc2019", + constraint_values = [ + "@bazel_tools//platforms:x86_64", + "@bazel_tools//platforms:windows", + ], + exec_properties = { + "container-image": "docker://gcr.io/tensorflow-testing/tf-win2019-rbe@sha256:5e91ddd99345204cd8da2e687d312eb64b3916f257023fd1b651b3dabefd9286", + "OSFamily": "Windows", + }, +) From 93264c2830d3ac041a0c19b305e934e8c4f0d1d5 Mon Sep 17 00:00:00 2001 From: Jiho Choi Date: Tue, 18 Feb 2020 17:17:06 -0800 Subject: [PATCH 197/442] Add a missing name scope. 
PiperOrigin-RevId: 295859433 Change-Id: I1a37cdaf61e4879f6fdfcf3100479fb57ffb95e9 --- tensorflow/core/profiler/utils/group_events.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/profiler/utils/group_events.cc b/tensorflow/core/profiler/utils/group_events.cc index 687c17280f0..3c0b7d50f56 100644 --- a/tensorflow/core/profiler/utils/group_events.cc +++ b/tensorflow/core/profiler/utils/group_events.cc @@ -236,7 +236,7 @@ void GroupTfEvents(XSpace* space, EventGroupNameMap* event_group_name_map) { {StatType::kStepId}}, {HostEventType::kExecutorStateProcess, HostEventType::kIteratorGetNextOp, - {StatType::kStepId, kIterNum}}, + {StatType::kStepId, StatType::kIterNum}}, {HostEventType::kKernelLaunch, HostEventType::kKernelExecute, {StatType::kCorrelationId}}}); From 0623e844ccf2717590fb9e9ff2843bb95b85ab26 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Tue, 18 Feb 2020 17:17:47 -0800 Subject: [PATCH 198/442] Abstract out the path separator and make use of it in JoinPath. This lays some ground work for correctly dealing with paths on Windows. PiperOrigin-RevId: 295859552 Change-Id: I72eb50f69c33df0916bd68e90196819f7b22ed2c --- tensorflow/core/platform/path.cc | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tensorflow/core/platform/path.cc b/tensorflow/core/platform/path.cc index ae705373a67..5c99b4eb68a 100644 --- a/tensorflow/core/platform/path.cc +++ b/tensorflow/core/platform/path.cc @@ -36,6 +36,11 @@ limitations under the License. namespace tensorflow { namespace io { namespace internal { +namespace { + +const char kPathSep[] = "/"; + +} // namespace string JoinPathImpl(std::initializer_list paths) { string result; @@ -48,18 +53,12 @@ string JoinPathImpl(std::initializer_list paths) { continue; } - if (result[result.size() - 1] == '/') { - if (IsAbsolutePath(path)) { - strings::StrAppend(&result, path.substr(1)); - } else { - strings::StrAppend(&result, path); - } + if (IsAbsolutePath(path)) path = path.substr(1); + + if (result[result.size() - 1] == kPathSep[0]) { + strings::StrAppend(&result, path); } else { - if (IsAbsolutePath(path)) { - strings::StrAppend(&result, path); - } else { - strings::StrAppend(&result, "/", path); - } + strings::StrAppend(&result, kPathSep, path); } } @@ -107,6 +106,7 @@ std::pair SplitBasename(StringPiece path) { StringPiece(path.data(), pos), StringPiece(path.data() + pos + 1, path.size() - (pos + 1))); } + } // namespace internal bool IsAbsolutePath(StringPiece path) { From 11a50f8873deda9d34152eaaf5f4d9f57f519438 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 18 Feb 2020 17:23:44 -0800 Subject: [PATCH 199/442] Remove NumPy 1.13 workaround in test. 
PiperOrigin-RevId: 295860559 Change-Id: Id80d310a3e61bb2206c721387a1f3cc0975f3ac5 --- tensorflow/python/kernel_tests/sparse_ops_test.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tensorflow/python/kernel_tests/sparse_ops_test.py b/tensorflow/python/kernel_tests/sparse_ops_test.py index 9982f000151..e4cc2046c64 100644 --- a/tensorflow/python/kernel_tests/sparse_ops_test.py +++ b/tensorflow/python/kernel_tests/sparse_ops_test.py @@ -705,9 +705,6 @@ class SparseReduceTest(test_util.TensorFlowTestCase): @test_util.run_deprecated_v1 def testGradient(self): - if np.__version__ == "1.13.0": - self.skipTest("numpy 1.13.0 bug") - np.random.seed(8161) test_dims = [(11, 1, 5, 7, 1), (2, 2)] with self.session(use_gpu=False): From b092dd17335aae4d970ddaa433fb6d2096f3feb6 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Tue, 18 Feb 2020 17:34:51 -0800 Subject: [PATCH 200/442] [TF:MLIR] Move Transpose operations across layout agnostic ops Part #1 PiperOrigin-RevId: 295862379 Change-Id: Ic2c71acb48cfb1274fafe5b4846e96048c36cdb7 --- .../mlir/tensorflow/ir/tf_generated_ops.td | 6 +- .../compiler/mlir/tensorflow/ir/tf_op_base.td | 4 + .../compiler/mlir/tensorflow/ir/tf_traits.h | 5 + ...ayout_optimization_layout_assignment.mlir} | 2 +- .../layout_optimization_move_transposes.mlir | 67 ++++++++++ .../transforms/layout_optimization.cc | 123 +++++++++++++++++- 6 files changed, 201 insertions(+), 6 deletions(-) rename tensorflow/compiler/mlir/tensorflow/tests/{layout_optimization.mlir => layout_optimization_layout_assignment.mlir} (97%) create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes.mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index ad00ab222a4..1d8dd178189 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -49,7 +49,7 @@ an output element, this operation computes \\(y = |x|\\). TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_AddOp : TF_Op<"Add", [NoSideEffect, ResultsBroadcastableShape]>, +def TF_AddOp : TF_Op<"Add", [NoSideEffect, ResultsBroadcastableShape, TF_LayoutAgnostic]>, WithBroadcastableBinOpBuilder { let summary = "Returns x + y element-wise."; @@ -98,7 +98,7 @@ Inputs must be of same size and shape. let hasFolder = 1; } -def TF_AddV2Op : TF_Op<"AddV2", [Commutative, NoSideEffect, ResultsBroadcastableShape]>, +def TF_AddV2Op : TF_Op<"AddV2", [Commutative, NoSideEffect, ResultsBroadcastableShape, TF_LayoutAgnostic]>, WithBroadcastableBinOpBuilder { let summary = "Returns x + y element-wise."; @@ -6781,7 +6781,7 @@ variables. TF_DerivedOperandSizeAttr N = TF_DerivedOperandSizeAttr<0>; } -def TF_TanhOp : TF_Op<"Tanh", [NoSideEffect, SameOperandsAndResultType]> { +def TF_TanhOp : TF_Op<"Tanh", [NoSideEffect, SameOperandsAndResultType, TF_LayoutAgnostic]> { let summary = "Computes hyperbolic tangent of `x` element-wise."; let description = [{ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td index b8d5e59f1a8..f3fdab674e4 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td @@ -58,6 +58,10 @@ TODO: Make invariants more structured so that we can reference them in ops. 
def TF_OperandsSameAsResultsTypeOrRef : NativeOpTrait< "TF::OperandsSameAsResultsTypeOrRef">; +// Layout agnostic operations do not depend on the operands data layout (data +// format), as an example all element wise operations are layout agnostic. +def TF_LayoutAgnostic : NativeOpTrait<"TF::LayoutAgnostic">; + //===----------------------------------------------------------------------===// // TensorFlow op definitions //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h index 51315c4f90c..18beb23663c 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h @@ -68,6 +68,11 @@ class OperandsSameAsResultsTypeOrRef } }; +// Layout agnostic operations do not depend on the operands data layout (data +// format), as and example all element wise operations are layout agnostic. +template +class LayoutAgnostic : public TraitBase {}; + } // namespace TF } // namespace OpTrait } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment.mlir similarity index 97% rename from tensorflow/compiler/mlir/tensorflow/tests/layout_optimization.mlir rename to tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment.mlir index f632e657421..e8d667aea0f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt %s -tf-layout-assignment=force-data-format=NCHW -verify-diagnostics | FileCheck %s +// RUN: tf-opt %s -tf-layout-assignment=force-data-format=NCHW -verify-diagnostics | FileCheck %s --dump-input=always // CHECK-LABEL: func @transposeBiasAdd func @transposeBiasAdd(%arg0: tensor<1x4x4x8xf32>, %arg1: tensor<8xf32>) -> tensor<1x4x4x8xf32> { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes.mlir new file mode 100644 index 00000000000..19b85393d78 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes.mlir @@ -0,0 +1,67 @@ +// RUN: tf-opt %s -tf-move-transposes -verify-diagnostics | FileCheck %s --dump-input=always + +// CHECK-LABEL: func @move_across_single_op +func @move_across_single_op(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { + + // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) + // CHECK: %[[TANH:[0-9]*]] = "tf.Tanh"(%[[ARG_TRANSPOSE]]) {{.*}} tensor<1x8x4x4xf32> + // CHECK: return %[[TANH]] + + %0 = "tf.Tanh"(%arg0) : (tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> + %1 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> + %2 = "tf.Transpose"(%0, %1) : (tensor<1x4x4x8xf32>, tensor<4xi64>) -> tensor<1x8x4x4xf32> + + return %2 : tensor<1x8x4x4xf32> +} + +// CHECK-LABEL: func @move_across_multiple_ops +func @move_across_multiple_ops(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { + + // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) + // CHECK: %[[TANH0:[0-9]*]] = 
"tf.Tanh"(%[[ARG_TRANSPOSE]]) {{.*}} tensor<1x8x4x4xf32> + // CHECK: %[[TANH1:[0-9]*]] = "tf.Tanh"(%[[TANH0]]) {{.*}} tensor<1x8x4x4xf32> + // CHECK: return %[[TANH1]] + + %0 = "tf.Tanh"(%arg0) : (tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> + %1 = "tf.Tanh"(%0) : (tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> + + %2 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> + %3 = "tf.Transpose"(%1, %2) : (tensor<1x4x4x8xf32>, tensor<4xi64>) -> tensor<1x8x4x4xf32> + + return %3 : tensor<1x8x4x4xf32> +} + +// CHECK-LABEL: func @move_across_multi_operand_op +func @move_across_multi_operand_op(%arg0: tensor<1x4x4x8xf32>, %arg1: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { + + // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK: %[[ARG0_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) + // CHECK: %[[ARG1_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg1, %[[ARG_PERM]]) + // CHECK: %[[ADD:[0-9]*]] = "tf.AddV2"(%[[ARG0_TRANSPOSE]], %[[ARG1_TRANSPOSE]]) {{.*}} tensor<1x8x4x4xf32> + // CHECK: return %[[ADD]] + + %0 = "tf.AddV2"(%arg0, %arg1) : (tensor<1x4x4x8xf32>, tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> + %1 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> + %2 = "tf.Transpose"(%0, %1) : (tensor<1x4x4x8xf32>, tensor<4xi64>) -> tensor<1x8x4x4xf32> + + return %2 : tensor<1x8x4x4xf32> +} + +// CHECK-LABEL: func @move_with_multiple_uses +func @move_with_multiple_uses(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { + + // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) + // CHECK: %[[TANH:[0-9]*]] = "tf.Tanh"(%[[ARG_TRANSPOSE]]) {{.*}} tensor<1x8x4x4xf32> + // CHECK: %[[ADD:[0-9]*]] = "tf.AddV2"(%[[TANH]], %[[TANH]]) {{.*}} tensor<1x8x4x4xf32> + // CHECK: return %[[ADD]] + + %0 = "tf.Tanh"(%arg0) : (tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> + %1 = "tf.AddV2"(%0, %0) : (tensor<1x4x4x8xf32>, tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> + %2 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> + %3 = "tf.Transpose"(%1, %2) : (tensor<1x4x4x8xf32>, tensor<4xi64>) -> tensor<1x8x4x4xf32> + + return %3 : tensor<1x8x4x4xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc b/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc index 24624e356ea..4e74ed9f0e0 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "llvm/ADT/STLExtras.h" +#include "mlir/IR/Attributes.h" // TF:llvm-project +#include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Function.h" // TF:llvm-project #include "mlir/Pass/Pass.h" // TF:llvm-project #include "mlir/Pass/PassRegistry.h" // TF:llvm-project @@ -25,6 +28,8 @@ namespace TF { namespace { +// LayoutAssignmentPass assigns optimal data layout (data format) for all +// layout sensitive operations. 
class LayoutAssignmentPass : public FunctionPass { public: LayoutAssignmentPass() = default; @@ -39,6 +44,14 @@ class LayoutAssignmentPass : public FunctionPass { llvm::cl::desc("Force data format for all layout sensitive ops")}; }; +// MoveTransposesPass moves all Transpose ops to the beginning or to the end of +// the basic block where they are defined. This will allow canonicalzer to +// delete redundant transposes. +class MoveTransposesPass : public FunctionPass { + public: + void runOnFunction() final; +}; + using Permutation = SmallVector; Permutation GetDataFormatPermutation(StringRef from_data_format, @@ -128,10 +141,116 @@ void LayoutAssignmentPass::runOnFunction() { }); } +// Move Transpose operations that permute `op` results before the `op`. +void MoveTransposeBefore(Operation* op, SmallVector* work_list) { + // TODO(ezhulenev): Move transpose across layout sensitive operations. + if (!op->hasTrait()) return; + + // Transpose operations that use operation results. + SmallVector transpose_ops; + + // Constant operation that defines permutation indices for result transposes. + ConstOp permutation_op; + + // All operation results must be used by transpose operations with the same + // permutation indices. + for (OpResult result : op->getResults()) { + for (Operation* user : result.getUsers()) { + // Result user must be a transpose operation. + TransposeOp transpose = dyn_cast(user); + if (!transpose) return; + + // With permutation defined by constant operation. + ConstOp perm = + dyn_cast_or_null(transpose.getOperand(1).getDefiningOp()); + if (!perm) return; + + // With the same permutation indices. + auto dense_elem_attr = perm.value().dyn_cast(); + if (!dense_elem_attr) return; + + if (!permutation_op) permutation_op = perm; + + // Check that permutation matches for all result transposes. + if (perm.value() != permutation_op.value()) return; + + // Add a transpose operation for later reuse. + transpose_ops.push_back(transpose); + } + } + + // Nothing to do here. + if (!permutation_op || transpose_ops.empty()) return; + + // At this point we checked that we can safely move Transpose node before + // `op`, and bypass all result transposes. + Location loc = op->getLoc(); + + // Move constant op defining result permutation to the beginning of the block. + permutation_op.getOperation()->moveBefore(&op->getBlock()->front()); + + // Bypass Transpose nodes for all results. + for (OpResult result : op->getResults()) { + result.setType(cast(*result.getUsers().begin()).y().getType()); + for (Operation* transpose : result.getUsers()) { + transpose->getResult(0).replaceAllUsesWith(result); + } + } + + // Maybe add a Transpose node for all operands (or reuse existing transposes). + OpBuilder builder(op); + builder.setInsertionPoint(op); + + for (OpOperand& operand : op->getOpOperands()) { + // Try to push transpose further up. + if (Operation* operand_op = operand.get().getDefiningOp()) + work_list->push_back(operand_op); + + // Try to reuse result transposes. + TransposeOp transpose; + if (!transpose_ops.empty()) { + transpose = transpose_ops.pop_back_val(); + transpose.getOperation()->moveBefore(op); + transpose.setOperand(0, operand.get()); + transpose.setOperand(1, permutation_op); + } else { + transpose = + builder.create(loc, operand.get(), permutation_op); + } + + operand.set(transpose); + } + + // Remove unused transpose operations. 
+ while (!transpose_ops.empty()) { + TransposeOp transpose = transpose_ops.pop_back_val(); + transpose.erase(); + } +} + +void MoveTransposesPass::runOnFunction() { + FuncOp func = getFunction(); + + SmallVector work_list; + + func.walk([&](TransposeOp transpose) { + for (auto operand : transpose.getOperands()) { + if (auto op = operand.getDefiningOp()) work_list.push_back(op); + } + }); + + while (!work_list.empty()) { + Operation* op = work_list.pop_back_val(); + MoveTransposeBefore(op, &work_list); + } +} + } // namespace -static PassRegistration pass("tf-layout-assignment", - "Layout assignment pass"); +static PassRegistration layout_assignment( + "tf-layout-assignment", "Layout assignment pass"); +static PassRegistration move_transposes( + "tf-move-transposes", "Move transposes pass"); } // namespace TF } // namespace mlir From 4ebb57d8d138f4b26b0de16036aa3086cda8b330 Mon Sep 17 00:00:00 2001 From: Skye Wanderman-Milne Date: Tue, 18 Feb 2020 17:41:01 -0800 Subject: [PATCH 201/442] [XLA:Python] Add `profiler_session` dep to `xla_extension` BUILD rule. https://github.com/tensorflow/tensorflow/commit/767e4d5dabeae612284ff45284c5b3f4e0679766 changed the `profiler_service_impl` BUILD rule to only depend on `profiler_session_headers`. Add the definitions back to `xla_extension` to avoid "symbol not found" errors. PiperOrigin-RevId: 295863418 Change-Id: Id9cdf1cc2d6dc2cdf6fec95e76eee15cf5e3b7be --- tensorflow/compiler/xla/python/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index 9fc0c5b04d0..44f7061d1ac 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -372,6 +372,7 @@ pybind_extension( # not require Tensorflow. "//tensorflow/core:lib_internal_impl", # buildcleaner: keep "//tensorflow/core/profiler/lib:profiler_backends", + "//tensorflow/core/profiler/lib:profiler_session", "//tensorflow/core/profiler/lib:traceme", "//tensorflow/core/profiler/rpc:profiler_server", "//tensorflow/stream_executor:device_memory_allocator", From 6c34ce08b22c487794d521422410c05022acc865 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 18 Feb 2020 17:47:49 -0800 Subject: [PATCH 202/442] Relax relative numerical error tolerance for bilinear resize. PiperOrigin-RevId: 295864434 Change-Id: I3bbd9d4b305a61c045eb23e0702b22304e62b0ad --- tensorflow/core/kernels/resize_bilinear_op_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/resize_bilinear_op_test.cc b/tensorflow/core/kernels/resize_bilinear_op_test.cc index bf6a92d671a..4873b49612d 100644 --- a/tensorflow/core/kernels/resize_bilinear_op_test.cc +++ b/tensorflow/core/kernels/resize_bilinear_op_test.cc @@ -143,7 +143,7 @@ class ResizeBilinearOpTestBase TensorShape({batch_size, output_width, output_height, channels}))); ResizeBilinearBaseline(input->tensor(), expected->tensor()); - test::ExpectClose(*expected, *GetOutput(0), /*atol=*/1e-5); + test::ExpectClose(*expected, *GetOutput(0), /*atol=*/3e-5); } void RunManyRandomTests(int channels) { From 0b5e649a09222b0294fc532cffe6b3d4f7a29fdf Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 18 Feb 2020 17:52:02 -0800 Subject: [PATCH 203/442] Add an environment variable to force Conv algorithm to use CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM. 
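When the boolean environment variable TF_USE_DEFAULT_CONV_ALGO is set to true, GetConvolveAlgorithms() returns only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM instead of the full candidate list, effectively pinning forward convolution to that algorithm; when it is unset or false, the algorithm list is unchanged (see the cuda_dnn.cc change below).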
PiperOrigin-RevId: 295865060 Change-Id: I4c21a16940f6a164203d6af54905da79e0593e29 --- tensorflow/stream_executor/cuda/cuda_dnn.cc | 36 +++++++++++++++------ 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index 45b95a5c14e..130841dde5f 100755 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -677,6 +677,18 @@ bool RequireCudnnDeterminism() { return require_cudnn_determinism; } +// A helper function to decide whether to force the default conv algorithm. +bool ConvUseDefaultAlgorithm() { + static bool use_default = [] { + bool use_default = false; + TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_USE_DEFAULT_CONV_ALGO", + /*default_val=*/false, + &use_default)); + return use_default; + }(); + return use_default; +} + std::tuple GetCcMajorMinor(Stream* stream) { int cc_major, cc_minor; stream->parent()->GetDeviceDescription().cuda_compute_capability(&cc_major, @@ -3337,21 +3349,27 @@ bool CudnnSupport::GetConvolveAlgorithms( bool tensor_op_math_available = TensorOpMathAvailable(cc_major); out_algorithms->clear(); - std::vector algo_types = { - // clang-format off + std::vector algo_types; + if (ConvUseDefaultAlgorithm()) { + // Force a fallback algorithm. + algo_types = {CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM}; + } else { + algo_types = { + // clang-format off CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, CUDNN_CONVOLUTION_FWD_ALGO_GEMM, CUDNN_CONVOLUTION_FWD_ALGO_DIRECT, CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, - // clang-format on - }; - if (CudnnEnvVar::IsEnabled()) { - algo_types.push_back(CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING); - } - if (CudnnEnvVar::IsEnabled() && with_winograd_nonfused) { - algo_types.push_back(CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED); + // clang-format on + }; + if (CudnnEnvVar::IsEnabled()) { + algo_types.push_back(CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING); + } + if (CudnnEnvVar::IsEnabled() && with_winograd_nonfused) { + algo_types.push_back(CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED); + } } // The algorithms are intentionally ordered for deterministic operation From 399a62bc6406834f167746a4061e7dd057ec8d9a Mon Sep 17 00:00:00 2001 From: Juho Ha Date: Tue, 18 Feb 2020 18:42:56 -0800 Subject: [PATCH 204/442] Add PlatformProfiler to support op tracing using platform tracing tools. 
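A minimal usage sketch, assuming an interpreter already built by InterpreterBuilder; the helper name is hypothetical. The new SetProfiler overload takes ownership of the profiler, and CreatePlatformProfiler() returns an ATrace-backed profiler on Android and a null profiler elsewhere:

    #include <memory>

    #include "tensorflow/lite/interpreter.h"
    #include "tensorflow/lite/profiling/platform_profiler.h"

    // Hypothetical helper: hands an owned platform profiler to the interpreter,
    // which keeps it alive for the lifetime of the interpreter.
    void InstallPlatformProfiler(tflite::Interpreter* interpreter) {
      interpreter->SetProfiler(tflite::profiling::CreatePlatformProfiler());
    }

This mirrors what InterpreterBuilder does when TFLite is compiled with --copt=-DTFLITE_ENABLE_DEFAULT_PROFILER (see the model.cc change below).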
PiperOrigin-RevId: 295872277 Change-Id: I8c02ec3974cd246bab70b47426778e9dda5938ee --- tensorflow/lite/BUILD | 14 +++- tensorflow/lite/interpreter.cc | 12 ++++ tensorflow/lite/interpreter.h | 12 ++++ tensorflow/lite/model.cc | 8 +++ tensorflow/lite/profiling/BUILD | 25 +++++++ tensorflow/lite/profiling/atrace_profiler.cc | 72 +++++++++++++++++++ tensorflow/lite/profiling/atrace_profiler.h | 53 ++++++++++++++ .../lite/profiling/platform_profiler.cc | 37 ++++++++++ tensorflow/lite/profiling/platform_profiler.h | 30 ++++++++ 9 files changed, 262 insertions(+), 1 deletion(-) create mode 100644 tensorflow/lite/profiling/atrace_profiler.cc create mode 100644 tensorflow/lite/profiling/atrace_profiler.h create mode 100644 tensorflow/lite/profiling/platform_profiler.cc create mode 100644 tensorflow/lite/profiling/platform_profiler.h diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD index 4c212785694..e9539d42f75 100644 --- a/tensorflow/lite/BUILD +++ b/tensorflow/lite/BUILD @@ -16,6 +16,13 @@ exports_files(glob([ "models/testdata/*", ])) +config_setting( + name = "enable_default_profiler", + values = { + "copt": "-DTFLITE_ENABLE_DEFAULT_PROFILER", + }, +) + config_setting( name = "gemmlowp_profiling", values = { @@ -239,7 +246,12 @@ cc_library( "//tensorflow/lite/experimental/resource", "//tensorflow/lite/nnapi:nnapi_implementation", "//tensorflow/lite/schema:schema_fbs", - ], + ] + select({ + ":enable_default_profiler": [ + "//tensorflow/lite/profiling:platform_profiler", + ], + "//conditions:default": [], + }), alwayslink = 1, ) diff --git a/tensorflow/lite/interpreter.cc b/tensorflow/lite/interpreter.cc index b839ffddd29..d333fa736e3 100644 --- a/tensorflow/lite/interpreter.cc +++ b/tensorflow/lite/interpreter.cc @@ -349,6 +349,18 @@ TfLiteStatus Interpreter::GetBufferHandle(int tensor_index, } void Interpreter::SetProfiler(Profiler* profiler) { + // Release resources occupied by owned_profiler_ which is replaced by + // caller-owned profiler. + owned_profiler_.reset(nullptr); + SetSubgraphProfiler(profiler); +} + +void Interpreter::SetProfiler(std::unique_ptr profiler) { + owned_profiler_ = std::move(profiler); + SetSubgraphProfiler(owned_profiler_.get()); +} + +void Interpreter::SetSubgraphProfiler(Profiler* profiler) { for (int subgraph_index = 0; subgraph_index < subgraphs_.size(); ++subgraph_index) { subgraphs_[subgraph_index]->SetProfiler(profiler, subgraph_index); diff --git a/tensorflow/lite/interpreter.h b/tensorflow/lite/interpreter.h index 4b4945cd8ac..093390afbb7 100644 --- a/tensorflow/lite/interpreter.h +++ b/tensorflow/lite/interpreter.h @@ -410,6 +410,11 @@ class Interpreter { /// WARNING: This is an experimental API and subject to change. void SetProfiler(Profiler* profiler); + /// Same as SetProfiler except this interpreter takes ownership + /// of the provided profiler. + /// WARNING: This is an experimental API and subject to change. + void SetProfiler(std::unique_ptr profiler); + /// Gets the profiler used for op tracing. /// WARNING: This is an experimental API and subject to change. Profiler* GetProfiler(); @@ -496,6 +501,9 @@ class Interpreter { TfLiteExternalContextType type, TfLiteExternalContext* ctx); + // Sets the profiler to all subgraphs. + void SetSubgraphProfiler(Profiler* profiler); + // A pure C data structure used to communicate with the pure C plugin // interface. To avoid copying tensor metadata, this is also the definitive // structure to store tensors. 
@@ -511,6 +519,10 @@ class Interpreter { // TODO(b/116667551): Use TfLiteExternalContext for storing state. std::vector owned_delegates_; + // Profiler that has been installed and is owned by this interpreter instance. + // Useful if client profiler ownership is burdensome. + std::unique_ptr owned_profiler_; + bool allow_buffer_handle_output_ = false; // List of active external contexts. diff --git a/tensorflow/lite/model.cc b/tensorflow/lite/model.cc index 46fee7fa1c8..22a4cf21213 100644 --- a/tensorflow/lite/model.cc +++ b/tensorflow/lite/model.cc @@ -29,6 +29,10 @@ limitations under the License. #include "tensorflow/lite/util.h" #include "tensorflow/lite/version.h" +#if defined(TFLITE_ENABLE_DEFAULT_PROFILER) +#include "tensorflow/lite/profiling/platform_profiler.h" +#endif + namespace tflite { namespace { @@ -687,6 +691,10 @@ TfLiteStatus InterpreterBuilder::operator()( (*interpreter)->AddSubgraphs(subgraphs->Length() - 1); } +#if defined(TFLITE_ENABLE_DEFAULT_PROFILER) + (*interpreter)->SetProfiler(tflite::profiling::CreatePlatformProfiler()); +#endif + for (int subgraph_index = 0; subgraph_index < subgraphs->Length(); ++subgraph_index) { const tflite::SubGraph* subgraph = (*subgraphs)[subgraph_index]; diff --git a/tensorflow/lite/profiling/BUILD b/tensorflow/lite/profiling/BUILD index 03dd5054c17..94c6a3c6613 100644 --- a/tensorflow/lite/profiling/BUILD +++ b/tensorflow/lite/profiling/BUILD @@ -23,6 +23,31 @@ cc_library( ], ) +cc_library( + name = "atrace_profiler", + srcs = ["atrace_profiler.cc"], + hdrs = ["atrace_profiler.h"], + copts = common_copts, + visibility = ["//visibility:private"], + deps = [ + "//tensorflow/lite/core/api", + "@com_google_absl//absl/strings", + ], +) + +cc_library( + name = "platform_profiler", + srcs = ["platform_profiler.cc"], + hdrs = ["platform_profiler.h"], + copts = common_copts, + deps = [ + "//tensorflow/lite/core/api", + ] + select({ + "//tensorflow:android": [":atrace_profiler"], + "//conditions:default": [], + }), +) + cc_test( name = "profiler_test", srcs = ["profiler_test.cc"], diff --git a/tensorflow/lite/profiling/atrace_profiler.cc b/tensorflow/lite/profiling/atrace_profiler.cc new file mode 100644 index 00000000000..8fe36416082 --- /dev/null +++ b/tensorflow/lite/profiling/atrace_profiler.cc @@ -0,0 +1,72 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/lite/profiling/atrace_profiler.h" + +#include + +#include "absl/strings/str_cat.h" + +namespace tflite { +namespace profiling { + +ATraceProfiler::ATraceProfiler() { + handle_ = dlopen("libandroid.so", RTLD_NOW | RTLD_LOCAL); + if (handle_) { + // Use dlsym() to prevent crashes on devices running Android 5.1 + // (API level 22) or lower. 
+ atrace_is_enabled_ = + reinterpret_cast(dlsym(handle_, "ATrace_isEnabled")); + atrace_begin_section_ = + reinterpret_cast(dlsym(handle_, "ATrace_beginSection")); + atrace_end_section_ = + reinterpret_cast(dlsym(handle_, "ATrace_endSection")); + + if (!atrace_is_enabled_ || !atrace_begin_section_ || !atrace_end_section_) { + dlclose(handle_); + handle_ = nullptr; + } + } +} + +ATraceProfiler::~ATraceProfiler() { + if (handle_) { + dlclose(handle_); + } +} + +uint32_t ATraceProfiler::BeginEvent(const char* tag, EventType event_type, + uint32_t event_metadata, + uint32_t event_subgraph_index) { + if (handle_ && atrace_is_enabled_()) { + // Note: When recording an OPERATOR_INVOKE_EVENT, we have recorded the op + // name as tag and node index as event_metadata. See the macro + // TFLITE_SCOPED_TAGGED_OPERATOR_PROFILE defined in + // tensorflow/lite/core/api/profiler.h for details. + // op_name@node_index/subgraph_index + std::string trace_event_tag = + absl::StrCat(tag, "@", event_metadata, "/", event_subgraph_index); + atrace_begin_section_(trace_event_tag.c_str()); + } + return 0; +} + +void ATraceProfiler::EndEvent(uint32_t event_handle) { + if (handle_) { + atrace_end_section_(); + } +} + +} // namespace profiling +} // namespace tflite diff --git a/tensorflow/lite/profiling/atrace_profiler.h b/tensorflow/lite/profiling/atrace_profiler.h new file mode 100644 index 00000000000..fcfb9f807ae --- /dev/null +++ b/tensorflow/lite/profiling/atrace_profiler.h @@ -0,0 +1,53 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_PROFILING_ATRACE_PROFILER_H_ +#define TENSORFLOW_LITE_PROFILING_ATRACE_PROFILER_H_ + +#include + +#include "tensorflow/lite/core/api/profiler.h" + +namespace tflite { +namespace profiling { + +// Profiler reporting to ATrace. +class ATraceProfiler : public tflite::Profiler { + public: + ATraceProfiler(); + + ~ATraceProfiler() override; + + uint32_t BeginEvent(const char* tag, EventType event_type, + uint32_t event_metadata, + uint32_t event_subgraph_index) override; + + void EndEvent(uint32_t event_handle) override; + + private: + using FpIsEnabled = std::add_pointer::type; + using FpBeginSection = std::add_pointer::type; + using FpEndSection = std::add_pointer::type; + + // Handle to libandroid.so library. Null if not supported. + void* handle_; + FpIsEnabled atrace_is_enabled_; + FpBeginSection atrace_begin_section_; + FpEndSection atrace_end_section_; +}; + +} // namespace profiling +} // namespace tflite + +#endif // TENSORFLOW_LITE_PROFILING_ATRACE_PROFILER_H_ diff --git a/tensorflow/lite/profiling/platform_profiler.cc b/tensorflow/lite/profiling/platform_profiler.cc new file mode 100644 index 00000000000..bbf5e178d66 --- /dev/null +++ b/tensorflow/lite/profiling/platform_profiler.cc @@ -0,0 +1,37 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/lite/profiling/platform_profiler.h" + +#include + +#include "tensorflow/lite/core/api/profiler.h" + +#if defined(__ANDROID__) +#include "tensorflow/lite/profiling/atrace_profiler.h" +#endif + +namespace tflite { +namespace profiling { + +std::unique_ptr CreatePlatformProfiler() { +#if defined(__ANDROID__) + return std::unique_ptr(new ATraceProfiler()); +#else + return std::unique_ptr(nullptr); +#endif +} + +} // namespace profiling +} // namespace tflite diff --git a/tensorflow/lite/profiling/platform_profiler.h b/tensorflow/lite/profiling/platform_profiler.h new file mode 100644 index 00000000000..87361b30b50 --- /dev/null +++ b/tensorflow/lite/profiling/platform_profiler.h @@ -0,0 +1,30 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_PROFILING_PLATFORM_PROFILER_H_ +#define TENSORFLOW_LITE_PROFILING_PLATFORM_PROFILER_H_ + +#include + +#include "tensorflow/lite/core/api/profiler.h" + +namespace tflite { +namespace profiling { + +std::unique_ptr CreatePlatformProfiler(); + +} // namespace profiling +} // namespace tflite + +#endif // TENSORFLOW_LITE_PROFILING_PLATFORM_PROFILER_H_ From ea13922cf19c620dcbe870fa7fd6432c196e7192 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Tue, 18 Feb 2020 18:57:20 -0800 Subject: [PATCH 205/442] Remove the depthwise conv 3x3 output shift > 0 restriction for per-channel. (it's a obsolete restriction), we're using sqrshl, so it's fine to handle output_shift <= 0 case. 
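NEON's SQRSHL treats a negative shift amount as a rounding right shift, so the same saturating-shift path already covers both signs of the per-channel output shift, and the early-out in Fast3x3FilterKernelSupported is unnecessary.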
PiperOrigin-RevId: 295873865 Change-Id: I20ef4ea4c70fd00c2a2d1fc75730ac0fbd807faf --- .../internal/optimized/depthwiseconv_3x3_filter_common.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h index e27e3d7b272..f7860e29e69 100644 --- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h +++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h @@ -453,7 +453,6 @@ inline bool Fast3x3FilterKernelSupported( const int32 filter_width = filter_shape.Dims(2); const int32 output_height = output_shape.Dims(1); const int32 output_width = output_shape.Dims(2); - const int32 output_depth = output_shape.Dims(3); bool supported = filter_width == 3 && filter_height == 3 && depth_multiplier == 1 && @@ -468,14 +467,6 @@ inline bool Fast3x3FilterKernelSupported( return false; } - if (quantization_type == QuantizationType::kPerChannelInt8) { - for (int i = 0; i < output_depth; ++i) { - if (output_shift_ptr[i] > 0) { - return false; - } - } - } - // Handle case where padding is zero but padding type is not kValid. // This would require special boundary case handling that is not supported. From ee7642b2670e33a45cc3a6f6585cfab7f7d4f8f6 Mon Sep 17 00:00:00 2001 From: Dayeong Lee Date: Tue, 18 Feb 2020 19:14:52 -0800 Subject: [PATCH 206/442] Move the writer functions of profileSummarizer to ProfileSummaryFormatter. PiperOrigin-RevId: 295875892 Change-Id: Ie27c735012f1337b848e94548ac26aea5b8770b6 --- tensorflow/lite/profiling/BUILD | 22 +++ .../lite/profiling/profile_summarizer.cc | 64 ++----- .../lite/profiling/profile_summarizer.h | 23 +-- .../profiling/profile_summary_formatter.cc | 97 +++++++++++ .../profiling/profile_summary_formatter.h | 84 +++++++++ .../profile_summary_formatter_test.cc | 164 ++++++++++++++++++ tensorflow/lite/tools/benchmark/BUILD | 1 + .../tools/benchmark/benchmark_tflite_model.cc | 1 + .../tools/benchmark/benchmark_tflite_model.h | 1 + .../tools/benchmark/profiling_listener.cc | 13 +- .../lite/tools/benchmark/profiling_listener.h | 7 +- 11 files changed, 410 insertions(+), 67 deletions(-) create mode 100644 tensorflow/lite/profiling/profile_summary_formatter.cc create mode 100644 tensorflow/lite/profiling/profile_summary_formatter.h create mode 100644 tensorflow/lite/profiling/profile_summary_formatter_test.cc diff --git a/tensorflow/lite/profiling/BUILD b/tensorflow/lite/profiling/BUILD index 94c6a3c6613..ac957590c21 100644 --- a/tensorflow/lite/profiling/BUILD +++ b/tensorflow/lite/profiling/BUILD @@ -112,6 +112,27 @@ cc_test( ], ) +cc_library( + name = "profile_summary_formatter", + srcs = ["profile_summary_formatter.cc"], + hdrs = ["profile_summary_formatter.h"], + copts = common_copts, + deps = [ + "//tensorflow/core/util:stats_calculator_portable", + ], +) + +cc_test( + name = "profile_summary_formatter_test", + srcs = ["profile_summary_formatter_test.cc"], + copts = common_copts, + deps = [ + ":profile_summary_formatter", + "//tensorflow/lite/testing:util", + "@com_google_googletest//:gtest", + ], +) + cc_library( name = "profile_summarizer", srcs = ["profile_summarizer.cc"], @@ -120,6 +141,7 @@ cc_library( deps = [ ":memory_info", ":profile_buffer", + ":profile_summary_formatter", "//tensorflow/core/util:stats_calculator_portable", "//tensorflow/lite:framework", "//tensorflow/lite/schema:schema_fbs", diff --git a/tensorflow/lite/profiling/profile_summarizer.cc 
b/tensorflow/lite/profiling/profile_summarizer.cc index 8f14efbb345..a4c763e4b28 100644 --- a/tensorflow/lite/profiling/profile_summarizer.cc +++ b/tensorflow/lite/profiling/profile_summarizer.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/lite/profiling/profile_summarizer.h" +#include #include #include "tensorflow/lite/profiling/memory_info.h" @@ -85,29 +86,21 @@ OperatorDetails GetOperatorDetails(const tflite::Interpreter& interpreter, return details; } -tensorflow::StatSummarizerOptions GetProfileSummarizerOptions( - bool format_as_csv) { - auto options = tensorflow::StatSummarizerOptions(); - // Summary will be manually handled per subgraphs in order to keep the - // compatibility. - options.show_summary = false; - options.show_memory = false; - options.format_as_csv = format_as_csv; - return options; -} - } // namespace -ProfileSummarizer::ProfileSummarizer(bool format_as_csv) - : delegate_stats_calculator_(new tensorflow::StatsCalculator( - GetProfileSummarizerOptions(format_as_csv))), - format_as_csv_(format_as_csv) { +ProfileSummarizer::ProfileSummarizer( + std::unique_ptr summary_formatter) + : summary_formatter_(std::move(summary_formatter)) { // Create stats calculator for the primary graph. stats_calculator_map_[0] = std::unique_ptr( new tensorflow::StatsCalculator( - GetProfileSummarizerOptions(format_as_csv))); -} + summary_formatter_->GetStatSummarizerOptions())); + // Create stats calculator for the delegation op. + delegate_stats_calculator_ = std::unique_ptr( + new tensorflow::StatsCalculator( + summary_formatter_->GetStatSummarizerOptions())); +} void ProfileSummarizer::ProcessProfiles( const std::vector& profile_stats, const tflite::Interpreter& interpreter) { @@ -214,45 +207,10 @@ tensorflow::StatsCalculator* ProfileSummarizer::GetStatsCalculator( stats_calculator_map_[subgraph_index] = std::unique_ptr( new tensorflow::StatsCalculator( - GetProfileSummarizerOptions(format_as_csv_))); + summary_formatter_->GetStatSummarizerOptions())); } return stats_calculator_map_[subgraph_index].get(); } -std::string ProfileSummarizer::GenerateReport(std::string tag, - bool include_output_string) { - std::stringstream stream; - bool has_non_primary_graph = - (stats_calculator_map_.size() - stats_calculator_map_.count(0)) > 0; - for (auto& stats_calc : stats_calculator_map_) { - auto subgraph_index = stats_calc.first; - auto subgraph_stats = stats_calc.second.get(); - if (has_non_primary_graph) { - if (subgraph_index == 0) - stream << "Primary graph " << tag << ":" << std::endl; - else - stream << "Subgraph (index: " << subgraph_index << ") " << tag << ":" - << std::endl; - } - if (include_output_string) { - stream << subgraph_stats->GetOutputString(); - } - if (subgraph_index != 0) { - stream << "Subgraph (index: " << subgraph_index << ") "; - } - stream << subgraph_stats->GetShortSummary() << std::endl; - } - - if (delegate_stats_calculator_->num_runs() > 0) { - stream << "Delegate internal: " << std::endl; - if (include_output_string) { - stream << delegate_stats_calculator_->GetOutputString(); - } - stream << delegate_stats_calculator_->GetShortSummary() << std::endl; - } - - return stream.str(); -} - } // namespace profiling } // namespace tflite diff --git a/tensorflow/lite/profiling/profile_summarizer.h b/tensorflow/lite/profiling/profile_summarizer.h index cb23f25385b..1348761b792 100644 --- a/tensorflow/lite/profiling/profile_summarizer.h +++ b/tensorflow/lite/profiling/profile_summarizer.h @@ -17,11 +17,13 @@ limitations under the License. 
#define TENSORFLOW_LITE_PROFILING_PROFILE_SUMMARIZER_H_ #include +#include #include #include "tensorflow/core/util/stats_calculator.h" #include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/profiling/profile_buffer.h" +#include "tensorflow/lite/profiling/profile_summary_formatter.h" namespace tflite { namespace profiling { @@ -29,21 +31,25 @@ namespace profiling { // Creates a summary of operator invocations in the interpreter. class ProfileSummarizer { public: - explicit ProfileSummarizer(bool format_as_csv = false); + explicit ProfileSummarizer( + std::unique_ptr summary_formatter = + std::make_unique()); virtual ~ProfileSummarizer() {} // Process profile events to update statistics for operator invocations. void ProcessProfiles(const std::vector& profile_stats, const tflite::Interpreter& interpreter); - // Returns a string detailing the accumulated runtime stats in a tab-separated - // format which can be pasted into a spreadsheet for further analysis. + // Returns a string detailing the accumulated runtime stats in the format of + // summary_formatter_. std::string GetOutputString() { - return GenerateReport("profile", /*include_output_string*/ true); + return summary_formatter_->GetOutputString(stats_calculator_map_, + *delegate_stats_calculator_); } std::string GetShortSummary() { - return GenerateReport("summary", /*include_output_string*/ false); + return summary_formatter_->GetShortSummary(stats_calculator_map_, + *delegate_stats_calculator_); } tensorflow::StatsCalculator* GetStatsCalculator(uint32_t subgraph_index); @@ -63,11 +69,8 @@ class ProfileSummarizer { std::unique_ptr delegate_stats_calculator_; - // GenerateReport returns the report of subgraphs in a string format. - std::string GenerateReport(std::string tag, bool include_output_string); - - // Whether output is formatted as CSV. - bool format_as_csv_ = false; + // Summary formatter for customized output formats. + std::unique_ptr summary_formatter_; }; } // namespace profiling diff --git a/tensorflow/lite/profiling/profile_summary_formatter.cc b/tensorflow/lite/profiling/profile_summary_formatter.cc new file mode 100644 index 00000000000..63023432ee7 --- /dev/null +++ b/tensorflow/lite/profiling/profile_summary_formatter.cc @@ -0,0 +1,97 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/profiling/profile_summary_formatter.h" + +#include +#include + +namespace tflite { +namespace profiling { + +std::string ProfileSummaryDefaultFormatter::GetOutputString( + const std::map>& + stats_calculator_map, + const tensorflow::StatsCalculator& delegate_stats_calculator) const { + return GenerateReport("profile", /*include_output_string*/ true, + stats_calculator_map, delegate_stats_calculator); +} + +std::string ProfileSummaryDefaultFormatter::GetShortSummary( + const std::map>& + stats_calculator_map, + const tensorflow::StatsCalculator& delegate_stats_calculator) const { + return GenerateReport("summary", /*include_output_string*/ false, + stats_calculator_map, delegate_stats_calculator); +} + +std::string ProfileSummaryDefaultFormatter::GenerateReport( + const std::string& tag, bool include_output_string, + const std::map>& + stats_calculator_map, + const tensorflow::StatsCalculator& delegate_stats_calculator) const { + std::stringstream stream; + bool has_non_primary_graph = + (stats_calculator_map.size() - stats_calculator_map.count(0)) > 0; + for (const auto& stats_calc : stats_calculator_map) { + auto subgraph_index = stats_calc.first; + auto subgraph_stats = stats_calc.second.get(); + if (has_non_primary_graph) { + if (subgraph_index == 0) { + stream << "Primary graph " << tag << ":" << std::endl; + } else { + stream << "Subgraph (index: " << subgraph_index << ") " << tag << ":" + << std::endl; + } + } + if (include_output_string) { + stream << subgraph_stats->GetOutputString(); + } + if (subgraph_index != 0) { + stream << "Subgraph (index: " << subgraph_index << ") "; + } + stream << subgraph_stats->GetShortSummary() << std::endl; + } + + if (delegate_stats_calculator.num_runs() > 0) { + stream << "Delegate internal: " << std::endl; + if (include_output_string) { + stream << delegate_stats_calculator.GetOutputString(); + } + stream << delegate_stats_calculator.GetShortSummary() << std::endl; + } + + return stream.str(); +} + +tensorflow::StatSummarizerOptions +ProfileSummaryDefaultFormatter::GetStatSummarizerOptions() const { + auto options = tensorflow::StatSummarizerOptions(); + // Summary will be manually handled per subgraphs in order to keep the + // compatibility. + options.show_summary = false; + options.show_memory = false; + return options; +} + +tensorflow::StatSummarizerOptions +ProfileSummaryCSVFormatter::GetStatSummarizerOptions() const { + auto options = ProfileSummaryDefaultFormatter::GetStatSummarizerOptions(); + options.format_as_csv = true; + return options; +} + +} // namespace profiling +} // namespace tflite diff --git a/tensorflow/lite/profiling/profile_summary_formatter.h b/tensorflow/lite/profiling/profile_summary_formatter.h new file mode 100644 index 00000000000..8f6f9f33e46 --- /dev/null +++ b/tensorflow/lite/profiling/profile_summary_formatter.h @@ -0,0 +1,84 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_PROFILING_PROFILE_SUMMARY_FORMATTER_H_ +#define TENSORFLOW_LITE_PROFILING_PROFILE_SUMMARY_FORMATTER_H_ + +#include +#include +#include +#include +#include +#include + +#include "tensorflow/core/util/stats_calculator.h" + +namespace tflite { +namespace profiling { + +// Formats the profile summary in a certain way. +class ProfileSummaryFormatter { + public: + ProfileSummaryFormatter() {} + virtual ~ProfileSummaryFormatter() {} + // Returns a string detailing the accumulated runtime stats in StatsCalculator + // of ProfileSummarizer. + virtual std::string GetOutputString( + const std::map>& + stats_calculator_map, + const tensorflow::StatsCalculator& delegate_stats_calculator) const = 0; + // Returns a string detailing the short summary of the the accumulated runtime + // stats in StatsCalculator of ProfileSummarizer. + virtual std::string GetShortSummary( + const std::map>& + stats_calculator_map, + const tensorflow::StatsCalculator& delegate_stats_calculator) const = 0; + virtual tensorflow::StatSummarizerOptions GetStatSummarizerOptions() + const = 0; +}; + +class ProfileSummaryDefaultFormatter : public ProfileSummaryFormatter { + public: + ProfileSummaryDefaultFormatter() {} + ~ProfileSummaryDefaultFormatter() override {} + std::string GetOutputString( + const std::map>& + stats_calculator_map, + const tensorflow::StatsCalculator& delegate_stats_calculator) + const override; + std::string GetShortSummary( + const std::map>& + stats_calculator_map, + const tensorflow::StatsCalculator& delegate_stats_calculator) + const override; + tensorflow::StatSummarizerOptions GetStatSummarizerOptions() const override; + + private: + std::string GenerateReport( + const std::string& tag, bool include_output_string, + const std::map>& + stats_calculator_map, + const tensorflow::StatsCalculator& delegate_stats_calculator) const; +}; + +class ProfileSummaryCSVFormatter : public ProfileSummaryDefaultFormatter { + public: + ProfileSummaryCSVFormatter() {} + tensorflow::StatSummarizerOptions GetStatSummarizerOptions() const override; +}; + +} // namespace profiling +} // namespace tflite + +#endif // TENSORFLOW_LITE_PROFILING_PROFILE_SUMMARY_FORMATTER_H_ diff --git a/tensorflow/lite/profiling/profile_summary_formatter_test.cc b/tensorflow/lite/profiling/profile_summary_formatter_test.cc new file mode 100644 index 00000000000..78d46aae1ea --- /dev/null +++ b/tensorflow/lite/profiling/profile_summary_formatter_test.cc @@ -0,0 +1,164 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/lite/profiling/profile_summary_formatter.h" + +#include +#include + +#include +#include +#include "tensorflow/lite/testing/util.h" + +namespace tflite { +namespace profiling { + +namespace { + +TEST(SummaryWriterTest, SummaryOptionStdOut) { + ProfileSummaryDefaultFormatter writer; + tensorflow::StatSummarizerOptions options = writer.GetStatSummarizerOptions(); + EXPECT_EQ(options.show_summary, false); + EXPECT_EQ(options.show_memory, false); + EXPECT_EQ(options.format_as_csv, false); +} + +TEST(SummaryWriterTest, SummaryOptionCSV) { + ProfileSummaryCSVFormatter writer; + tensorflow::StatSummarizerOptions options = writer.GetStatSummarizerOptions(); + EXPECT_EQ(options.show_summary, false); + EXPECT_EQ(options.show_memory, false); + EXPECT_EQ(options.format_as_csv, true); +} +TEST(SummaryWriterTest, EmptyOutputString) { + ProfileSummaryDefaultFormatter writer; + std::string output = writer.GetOutputString( + std::map>(), + tensorflow::StatsCalculator(writer.GetStatSummarizerOptions())); + EXPECT_EQ(output.size(), 0); +} + +TEST(SummaryWriterTest, EmptyShortSummary) { + ProfileSummaryDefaultFormatter writer; + std::string output = writer.GetShortSummary( + std::map>(), + tensorflow::StatsCalculator(writer.GetStatSummarizerOptions())); + EXPECT_EQ(output.size(), 0); +} + +TEST(SummaryWriterTest, SingleSubgraphOutputString) { + ProfileSummaryDefaultFormatter writer; + std::map> + stats_calculator_map; + stats_calculator_map[0] = std::make_unique( + writer.GetStatSummarizerOptions()); + std::string output = writer.GetOutputString( + stats_calculator_map, + tensorflow::StatsCalculator(writer.GetStatSummarizerOptions())); + ASSERT_TRUE(output.find("Run Order") != std::string::npos); + ASSERT_TRUE(output.find("Top by Computation Time") != std::string::npos); + ASSERT_TRUE(output.find("Top by Memory Use") == std::string::npos); + ASSERT_TRUE(output.find("Summary by node type") != std::string::npos); + ASSERT_TRUE(output.find("nodes observed") != std::string::npos); + ASSERT_TRUE(output.find("Primary graph") == std::string::npos); + ASSERT_TRUE(output.find("Subgraph") == std::string::npos); + ASSERT_TRUE(output.find("Delegate internal") == std::string::npos); +} + +TEST(SummaryWriterTest, SingleSubgraphShortSummary) { + ProfileSummaryDefaultFormatter writer; + std::map> + stats_calculator_map; + stats_calculator_map[0] = std::make_unique( + writer.GetStatSummarizerOptions()); + std::string output = writer.GetShortSummary( + stats_calculator_map, + tensorflow::StatsCalculator(writer.GetStatSummarizerOptions())); + ASSERT_TRUE(output.find("Run Order") == std::string::npos); + ASSERT_TRUE(output.find("Top by Computation Time") == std::string::npos); + ASSERT_TRUE(output.find("Top by Memory Use") == std::string::npos); + ASSERT_TRUE(output.find("Summary by node type") == std::string::npos); + ASSERT_TRUE(output.find("nodes observed") != std::string::npos); + ASSERT_TRUE(output.find("Primary graph") == std::string::npos); + ASSERT_TRUE(output.find("Subgraph") == std::string::npos); + ASSERT_TRUE(output.find("Delegate internal") == std::string::npos); +} + +TEST(SummaryWriterTest, MultiSubgraphOutputString) { + ProfileSummaryDefaultFormatter writer; + std::map> + stats_calculator_map; + stats_calculator_map[0] = std::make_unique( + writer.GetStatSummarizerOptions()); + stats_calculator_map[1] = std::make_unique( + writer.GetStatSummarizerOptions()); + std::string output = writer.GetOutputString( + 
stats_calculator_map, + tensorflow::StatsCalculator(writer.GetStatSummarizerOptions())); + ASSERT_TRUE(output.find("Primary graph") != std::string::npos); + ASSERT_TRUE(output.find("Subgraph") != std::string::npos); + ASSERT_TRUE(output.find("Delegate internal") == std::string::npos); +} + +TEST(SummaryWriterTest, MultiSubgraphShortSummary) { + ProfileSummaryDefaultFormatter writer; + std::map> + stats_calculator_map; + stats_calculator_map[0] = std::make_unique( + writer.GetStatSummarizerOptions()); + stats_calculator_map[1] = std::make_unique( + writer.GetStatSummarizerOptions()); + std::string output = writer.GetShortSummary( + stats_calculator_map, + tensorflow::StatsCalculator(writer.GetStatSummarizerOptions())); + ASSERT_TRUE(output.find("Primary graph") != std::string::npos); + ASSERT_TRUE(output.find("Subgraph") != std::string::npos); + ASSERT_TRUE(output.find("Delegate internal") == std::string::npos); +} + +TEST(SummaryWriterTest, DelegationOutputString) { + ProfileSummaryDefaultFormatter writer; + auto delegate_stats_calculator = + tensorflow::StatsCalculator(writer.GetStatSummarizerOptions()); + delegate_stats_calculator.UpdateRunTotalUs(1); + std::string output = writer.GetOutputString( + std::map>(), + delegate_stats_calculator); + ASSERT_TRUE(output.find("Primary graph") == std::string::npos); + ASSERT_TRUE(output.find("Subgraph") == std::string::npos); + ASSERT_TRUE(output.find("Delegate internal") != std::string::npos); +} + +TEST(SummaryWriterTest, DelegationShortSummary) { + ProfileSummaryDefaultFormatter writer; + auto delegate_stats_calculator = + tensorflow::StatsCalculator(writer.GetStatSummarizerOptions()); + delegate_stats_calculator.UpdateRunTotalUs(1); + std::string output = writer.GetShortSummary( + std::map>(), + delegate_stats_calculator); + ASSERT_TRUE(output.find("Primary graph") == std::string::npos); + ASSERT_TRUE(output.find("Subgraph") == std::string::npos); + ASSERT_TRUE(output.find("Delegate internal") != std::string::npos); +} + +} // namespace +} // namespace profiling +} // namespace tflite + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/lite/tools/benchmark/BUILD b/tensorflow/lite/tools/benchmark/BUILD index df3194ff7e6..72968fc8e24 100644 --- a/tensorflow/lite/tools/benchmark/BUILD +++ b/tensorflow/lite/tools/benchmark/BUILD @@ -148,6 +148,7 @@ cc_library( "//tensorflow/lite/experimental/ruy/profiler", "//tensorflow/lite/kernels:builtin_ops", "//tensorflow/lite/profiling:profiler", + "//tensorflow/lite/profiling:profile_summary_formatter", "//tensorflow/lite/tools/evaluation:utils", ] + select({ "//tensorflow:fuchsia": [], diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc index 064eca0022f..23b76a921c5 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc +++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc @@ -32,6 +32,7 @@ limitations under the License. 
#include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/model.h" #include "tensorflow/lite/op_resolver.h" +#include "tensorflow/lite/profiling/profile_summary_formatter.h" #include "tensorflow/lite/string_util.h" #include "tensorflow/lite/tools/benchmark/benchmark_utils.h" #include "tensorflow/lite/tools/benchmark/delegate_provider.h" diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h index a0bcce843ab..1d056bdf0cf 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h +++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h @@ -24,6 +24,7 @@ limitations under the License. #include #include "tensorflow/lite/model.h" +#include "tensorflow/lite/profiling/profile_summary_formatter.h" #include "tensorflow/lite/profiling/profiler.h" #include "tensorflow/lite/tools/benchmark/benchmark_model.h" diff --git a/tensorflow/lite/tools/benchmark/profiling_listener.cc b/tensorflow/lite/tools/benchmark/profiling_listener.cc index a04015219ea..8d7a0fe3537 100644 --- a/tensorflow/lite/tools/benchmark/profiling_listener.cc +++ b/tensorflow/lite/tools/benchmark/profiling_listener.cc @@ -22,11 +22,11 @@ namespace benchmark { ProfilingListener::ProfilingListener(Interpreter* interpreter, uint32_t max_num_entries, - std::string csv_file_path) + const std::string& csv_file_path) : interpreter_(interpreter), profiler_(max_num_entries), - run_summarizer_(!csv_file_path.empty()), - init_summarizer_(!csv_file_path.empty()), + run_summarizer_(CreateProfileSummaryFormatter(!csv_file_path.empty())), + init_summarizer_(CreateProfileSummaryFormatter(!csv_file_path.empty())), csv_file_path_(csv_file_path) { TFLITE_BENCHMARK_CHECK(interpreter); interpreter_->SetProfiler(&profiler_); @@ -85,5 +85,12 @@ void ProfilingListener::WriteOutput(const std::string& header, (*stream) << data << std::endl; } +std::unique_ptr +ProfilingListener::CreateProfileSummaryFormatter(bool format_as_csv) const { + return format_as_csv + ? std::make_unique() + : std::make_unique(); +} + } // namespace benchmark } // namespace tflite diff --git a/tensorflow/lite/tools/benchmark/profiling_listener.h b/tensorflow/lite/tools/benchmark/profiling_listener.h index 84ef70d800d..9c0f6745bbb 100644 --- a/tensorflow/lite/tools/benchmark/profiling_listener.h +++ b/tensorflow/lite/tools/benchmark/profiling_listener.h @@ -27,7 +27,7 @@ namespace benchmark { class ProfilingListener : public BenchmarkListener { public: explicit ProfilingListener(Interpreter* interpreter, uint32_t max_num_entries, - std::string csv_file_path = ""); + const std::string& csv_file_path = ""); void OnBenchmarkStart(const BenchmarkParams& params) override; @@ -37,6 +37,11 @@ class ProfilingListener : public BenchmarkListener { void OnBenchmarkEnd(const BenchmarkResults& results) override; + protected: + // Allow subclasses to create a customized summary writer during init. 
+ virtual std::unique_ptr + CreateProfileSummaryFormatter(bool format_as_csv) const; + private: void WriteOutput(const std::string& header, const string& data, std::ostream* stream); From 6cb8ec0e317895199ff363d81a54cf305634f363 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Tue, 18 Feb 2020 20:23:10 -0800 Subject: [PATCH 207/442] Do not issue an error to TensorFlow when MLIR issues a warning PiperOrigin-RevId: 295883589 Change-Id: Id5959647363e8f894585f185a9e4e1ca07065c35 --- tensorflow/compiler/mlir/tensorflow/utils/error_util.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/utils/error_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/error_util.cc index 2181f4f8c9b..60646ae764e 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/error_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/error_util.cc @@ -63,21 +63,21 @@ Status StatusScopedDiagnosticHandler::Combine(Status status) { } LogicalResult StatusScopedDiagnosticHandler::handler(Diagnostic* diag) { -#ifndef NDEBUG + // Non-error diagnostic are ignored when VLOG isn't enabled. + if (diag->getSeverity() != DiagnosticSeverity::Error && VLOG_IS_ON(1)) + return success(); + size_t current_diag_str_size_ = diag_str_.size(); -#endif // Emit the diagnostic and flush the stream. emitDiagnostic(*diag); diag_stream_.flush(); -#ifndef NDEBUG // Emit non-errors to VLOG instead of the internal status. if (diag->getSeverity() != DiagnosticSeverity::Error) { VLOG(1) << diag_str_.substr(current_diag_str_size_); diag_str_.resize(current_diag_str_size_); } -#endif // Return failure to signal propagation if necessary. return failure(propagate_); From f9e9fb9de2af3a3b88c94287f1660709ed39fabb Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Tue, 18 Feb 2020 20:26:53 -0800 Subject: [PATCH 208/442] Fix tpuv1_outline_tpu_island to handle transitive function calls PiperOrigin-RevId: 295883908 Change-Id: I384bed4144942ebf31b1b3875513fe9e1bae8019 --- .../executor_tpuv1_outline_tpu_island.mlir | 0 .../while_op.mlir | 48 +++++++++++++++++++ .../executor_tpuv1_outline_tpu_island.cc | 18 ++++++- 3 files changed, 64 insertions(+), 2 deletions(-) rename tensorflow/compiler/mlir/tensorflow/tests/{ => executor_tpuv1_outline_island}/executor_tpuv1_outline_tpu_island.mlir (100%) create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_island/while_op.mlir diff --git a/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_tpu_island.mlir b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_island/executor_tpuv1_outline_tpu_island.mlir similarity index 100% rename from tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_tpu_island.mlir rename to tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_island/executor_tpuv1_outline_tpu_island.mlir diff --git a/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_island/while_op.mlir b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_island/while_op.mlir new file mode 100644 index 00000000000..b1dee63ca03 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_island/while_op.mlir @@ -0,0 +1,48 @@ +// RUN: tf-opt %s -tf-executor-tpu-v1-island-outlining | FileCheck %s --dump-input=fail + +// CHECK: func @control_input +// CHECK-NOT: func @ +// CHECK-LABEL: module @_tpu_v1_compat_outlined +// CHECK: @_tpu_v1_compat_outlined_func0 +// CHECK: func @while_body_with_cluster_attr +// CHECK: func @while_cond_with_cluster_attr 
+// CHECK: func @while_body_without_cluster_attr +// CHECK: func @while_cond_without_cluster_attr +// CHECK: func @callee_func +module { + func @control_input(%arg0: tensor) -> tensor { + %0:4 = tf_executor.graph { + %outputs:4, %control = tf_executor.island { + "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", device = "device", num_replicas = 1, topology = "topology"} : () -> () + %1 = "tf.opA"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor + %2 = "tf.While"(%1) {body = @while_body_with_cluster_attr, cond = @while_cond_with_cluster_attr, is_stateless = false, name = "A", parallel_iterations = 10 : i64} : (tensor) -> tensor + %3 = "tf.While"(%1) {body = @while_body_without_cluster_attr, cond = @while_cond_with_cluster_attr, is_stateless = false, name = "C", parallel_iterations = 10 : i64} : (tensor) -> tensor + %4 = "tf.While"(%1) {body = @while_body_with_cluster_attr, cond = @while_cond_without_cluster_attr, is_stateless = false, name = "E", parallel_iterations = 10 : i64} : (tensor) -> tensor + tf_executor.yield %1, %2, %3, %4 : tensor, tensor, tensor, tensor + } + tf_executor.fetch %outputs#0, %outputs#1, %outputs#2, %outputs#3 : tensor, tensor, tensor, tensor + + } + return %0#0 : tensor + } + func @while_body_with_cluster_attr(%arg0: tensor) -> tensor { + %0 = "some.op"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor + return %0 : tensor + } + func @while_cond_with_cluster_attr(%arg0: tensor) -> tensor { + %0 = "some.op"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor + return %0 : tensor + } + func @while_body_without_cluster_attr(%arg0: tensor) -> tensor { + %0 = "some.op"(%arg0) : (tensor) -> tensor + return %0 : tensor + } + func @while_cond_without_cluster_attr(%arg0: tensor) -> tensor { + %0 = "tf.PartionedCalledOp"(%arg0) { f = @callee_func} : (tensor) -> tensor + return %0 : tensor + } + func @callee_func(%arg0: tensor) -> tensor { + %0 = "some.op"(%arg0) : (tensor) -> tensor + return %0 : tensor + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc index b553a74d097..57ea1822b5b 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc @@ -133,9 +133,23 @@ void TPUBridgeExecutorIslandOutlining::runOnModule() { /*executor_type=*/builder.getStringAttr("")); SmallVector yield_operands(call_op.getResults()); builder.create(island_op.getLoc(), yield_operands); + } - // TODO(aminim): handle transitively referenced function and clone them in - // the new module. + // Outlined all the transitively called functions by moving them in the + // outlined module. 
+ for (FuncOp func : outlined_module.getOps()) { + func.walk([&](Operation *op) { + for (NamedAttribute attr : op->getAttrs()) { + auto symbol_ref = attr.second.dyn_cast(); + if (!symbol_ref) continue; + if (outlined_symbol_table.lookup(symbol_ref.getValue())) + continue; + FuncOp callee = symbol_table.lookup(symbol_ref.getValue()); + callee.getOperation()->getBlock()->getOperations().remove( + callee.getOperation()); + outlined_symbol_table.insert(callee); + } + }); } } From 9771b11027394364e44d9d745bbcee924bfbba98 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Tue, 18 Feb 2020 20:43:05 -0800 Subject: [PATCH 209/442] Add support for called function in tpuv1 inlining pass When a callgraph is involved, we need to inline back the called functions as well before deleting the nested module. PiperOrigin-RevId: 295885585 Change-Id: I61a4274e06a3009e97ca800cc2ed60591e522149 --- .../compiler/mlir/tensorflow/ir/tf_device.cc | 43 ++++++++++++++++++ .../executor_tpuv1_inline_tpu_island.mlir | 0 .../while_op.mlir | 44 +++++++++++++++++++ .../executor_tpuv1_inline_tpu_island.cc | 11 +++++ 4 files changed, 98 insertions(+) rename tensorflow/compiler/mlir/tensorflow/tests/{ => executor_tpuv1_island_inlining}/executor_tpuv1_inline_tpu_island.mlir (100%) create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_inlining/while_op.mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc index 5c277eeb9db..c88ddaf7806 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc @@ -41,11 +41,52 @@ limitations under the License. #include "mlir/Support/LLVM.h" // TF:llvm-project #include "mlir/Support/LogicalResult.h" // TF:llvm-project #include "mlir/Support/STLExtras.h" // TF:llvm-project +#include "mlir/Transforms/InliningUtils.h" // TF:llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/core/platform/logging.h" namespace mlir { namespace tf_device { +//===----------------------------------------------------------------------===// +// TF Device Dialect Interfaces +//===----------------------------------------------------------------------===// + +namespace { +struct TFInlinerInterface : public DialectInlinerInterface { + using DialectInlinerInterface::DialectInlinerInterface; + + //===--------------------------------------------------------------------===// + // Analysis Hooks + //===--------------------------------------------------------------------===// + + // Defines the legality of inlining TF Device operations. + bool isLegalToInline(Operation*, Region*, BlockAndValueMapping&) const final { + // For now, enable inlining all operations. + return true; + } + + //===--------------------------------------------------------------------===// + // Transformation Hooks + //===--------------------------------------------------------------------===// + + // Attempts to materialize a conversion for a type mismatch between a call + // from this dialect, and a callable region. This method should generate an + // operation that takes 'input' as the only operand, and produces a single + // result of 'resultType'. If a conversion can not be generated, nullptr + // should be returned. + // This is just re-using the same logic as the TensorFlow dialect right now. 
+ Operation* materializeCallConversion(OpBuilder& builder, Value input, + Type result_type, + Location conversion_loc) const final { + if (!result_type.isa() || !input.getType().isa()) + return nullptr; + return builder.create(conversion_loc, result_type, input, + /*truncate=*/builder.getBoolAttr(false)); + } +}; +} // end anonymous namespace + TensorFlowDeviceDialect::TensorFlowDeviceDialect(MLIRContext* context) : Dialect(/*name=*/"tf_device", context) { addOperations< @@ -54,6 +95,8 @@ TensorFlowDeviceDialect::TensorFlowDeviceDialect(MLIRContext* context) >(); addOperations(); + + addInterfaces(); } //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_inline_tpu_island.mlir b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_inlining/executor_tpuv1_inline_tpu_island.mlir similarity index 100% rename from tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_inline_tpu_island.mlir rename to tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_inlining/executor_tpuv1_inline_tpu_island.mlir diff --git a/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_inlining/while_op.mlir b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_inlining/while_op.mlir new file mode 100644 index 00000000000..010b5346e1e --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_inlining/while_op.mlir @@ -0,0 +1,44 @@ +// RUN: tf-opt %s -tf-executor-tpu-v1-island-inlining | FileCheck %s --dump-input=fail + +// CHECK-NOT: tf.PartitionedCall +// CHECK-NOT: module @_tpu_v1_compat_outlined + +module { + func @control_input(%arg0: tensor) -> tensor { + %0:4 = tf_executor.graph { + %outputs:4, %control = tf_executor.island wraps "tf.PartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @_tpu_v1_compat_outlined::@_tpu_v1_compat_outlined_func0} : (tensor) -> (tensor, tensor, tensor, tensor) + tf_executor.fetch %outputs#0, %outputs#1, %outputs#2, %outputs#3 : tensor, tensor, tensor, tensor + } + return %0#0 : tensor + } + module @_tpu_v1_compat_outlined { + func @_tpu_v1_compat_outlined_func0(%arg0: tensor) -> (tensor, tensor, tensor, tensor) { + "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", device = "device", num_replicas = 1 : i64, topology = "topology"} : () -> () + %0 = "tf.opA"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor + %1 = "tf.While"(%0) {body = @while_body_with_cluster_attr, cond = @while_cond_with_cluster_attr, is_stateless = false, name = "A", parallel_iterations = 10 : i64} : (tensor) -> tensor + %2 = "tf.While"(%0) {body = @while_body_without_cluster_attr, cond = @while_cond_with_cluster_attr, is_stateless = false, name = "C", parallel_iterations = 10 : i64} : (tensor) -> tensor + %3 = "tf.While"(%0) {body = @while_body_with_cluster_attr, cond = @while_cond_without_cluster_attr, is_stateless = false, name = "E", parallel_iterations = 10 : i64} : (tensor) -> tensor + return %0, %1, %2, %3 : tensor, tensor, tensor, tensor + } + func @while_body_with_cluster_attr(%arg0: tensor) -> tensor { + %0 = "some.op"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor + return %0 : tensor + } + func @while_cond_with_cluster_attr(%arg0: tensor) -> tensor { + %0 = "some.op"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor + return %0 : tensor + } + func @while_body_without_cluster_attr(%arg0: tensor) -> tensor { + %0 = "some.op"(%arg0) : (tensor) -> tensor + return %0 : tensor + } + 
func @while_cond_without_cluster_attr(%arg0: tensor) -> tensor { + %0 = "tf.PartionedCalledOp"(%arg0) {f = @callee_func} : (tensor) -> tensor + return %0 : tensor + } + func @callee_func(%arg0: tensor) -> tensor { + %0 = "some.op"(%arg0) : (tensor) -> tensor + return %0 : tensor + } + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_inline_tpu_island.cc b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_inline_tpu_island.cc index 80fcd52056d..9660367cb68 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_inline_tpu_island.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_inline_tpu_island.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" @@ -70,10 +71,20 @@ void TPUBridgeExecutorIslandInlining::runOnModule() { call_op.emitOpError() << "Failed to inline\n"; return WalkResult::interrupt(); } + called_func.erase(); call_op.erase(); return WalkResult::advance(); }); if (walk_result.wasInterrupted()) return signalPassFailure(); + // Move all remaining nested functions back into the parent module. + Block &nested_block = nested_module->getRegion(0).front(); + for (FuncOp func_op : + llvm::make_early_inc_range(nested_block.getOps())) { + if (!symbol_table.lookupSymbolIn(getModule(), func_op.getName())) { + nested_block.getOperations().remove(func_op.getOperation()); + symbol_table.insert(func_op.getOperation()); + } + } nested_module->erase(); } From 823384e08f5326609a665f155b99d6ef20deca16 Mon Sep 17 00:00:00 2001 From: Yunxing Dai Date: Tue, 18 Feb 2020 20:44:59 -0800 Subject: [PATCH 210/442] [Resubmit] Fix several issues of multi output fusion. 
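The fix has two parts. First, instruction deletion becomes a two-stage process: HloComputation::RemoveInstruction now only unlinks the instruction and parks it in to_be_deleted_, and the pass pipeline calls Cleanup() on the module after each pass to do the actual deallocation, so a pass can hold stable HloInstruction pointers while it adds and removes instructions. Second, the multi-output-fusion worklist breaks score ties with an insertion timestamp, making the pop order deterministic. A minimal standalone sketch of the deferred-deletion idea, with illustrative names rather than the real XLA classes:

// Two-stage removal: Remove() unlinks an element but keeps the allocation
// alive in to_be_deleted_, so raw pointers held by a running pass stay valid;
// Cleanup(), invoked after the pass has finished, frees them.
#include <iostream>
#include <list>
#include <memory>
#include <string>
#include <utility>
#include <vector>

class Instruction {
 public:
  explicit Instruction(std::string name) : name_(std::move(name)) {}
  const std::string& name() const { return name_; }

 private:
  std::string name_;
};

class Computation {
 public:
  Instruction* Add(std::string name) {
    instructions_.push_back(std::make_unique<Instruction>(std::move(name)));
    return instructions_.back().get();
  }
  // Stage 1: unlink the instruction but defer its deallocation.
  void Remove(Instruction* inst) {
    for (auto it = instructions_.begin(); it != instructions_.end(); ++it) {
      if (it->get() == inst) {
        to_be_deleted_.push_back(std::move(*it));
        instructions_.erase(it);
        return;
      }
    }
  }
  // Stage 2: deallocate everything removed since the last Cleanup().
  void Cleanup() { to_be_deleted_.clear(); }

 private:
  std::list<std::unique_ptr<Instruction>> instructions_;
  std::vector<std::unique_ptr<Instruction>> to_be_deleted_;
};

int main() {
  Computation comp;
  Instruction* a = comp.Add("a");
  comp.Add("b");
  comp.Remove(a);
  // A pass that cached `a` can still dereference it safely here...
  std::cout << a->name() << " is removed but not yet freed\n";
  comp.Cleanup();  // ...and the pipeline frees it after the pass returns.
  return 0;
}

Deferring deallocation this way is what lets a pass keep raw instruction pointers in its own bookkeeping while it mutates the computation; the worklist timestamp, in turn, removes the dependence of fusion order on pointer or hash ordering.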
PiperOrigin-RevId: 295885785 Change-Id: I1e6350437987f63843181d704fe86660fd9cfb8c --- .../compiler/xla/service/hlo_computation.cc | 2 ++ .../compiler/xla/service/hlo_computation.h | 10 ++++++ .../compiler/xla/service/hlo_instruction.cc | 6 +++- .../compiler/xla/service/hlo_instruction.h | 10 +++++- tensorflow/compiler/xla/service/hlo_module.h | 7 ++++ .../compiler/xla/service/hlo_module_group.h | 7 ++++ .../compiler/xla/service/hlo_pass_pipeline.h | 8 +++-- .../xla/service/multi_output_fusion.cc | 4 +-- .../xla/service/multi_output_fusion.h | 33 ++++++++++++++++--- 9 files changed, 77 insertions(+), 10 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc index 122122aae55..22d9f1bc648 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.cc +++ b/tensorflow/compiler/xla/service/hlo_computation.cc @@ -309,6 +309,8 @@ Status HloComputation::RemoveInstructionImpl(HloInstruction* instruction, auto inst_it = instruction_iterators_.find(instruction); TF_RET_CHECK(inst_it != instruction_iterators_.end()); (*inst_it->second)->set_parent(nullptr); + to_be_deleted_.emplace_back(inst_it->second->release()); + to_be_deleted_.back()->DetachFromOperandsAndUsers(); instructions_.erase(inst_it->second); instruction_iterators_.erase(inst_it); return Status::OK(); diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h index 9ca60403929..f1568858d9f 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.h +++ b/tensorflow/compiler/xla/service/hlo_computation.h @@ -469,6 +469,12 @@ class HloComputation { int64 unique_id() const { return unique_id_; } + // Deallocate instructions that are marked by "RemoveInstruction". The two + // stage clean up process is designed such that HloPass can have stable + // internal pointers to HloInstructions while we create and remove + // HloInstructions in a pass. + void Cleanup() { to_be_deleted_.clear(); } + private: explicit HloComputation( const string& name, int parameter_count, @@ -527,6 +533,10 @@ class HloComputation { absl::flat_hash_map instruction_iterators_; + // Removed instructions are moved into to_be_deleted_ first and then + // deallocated when Cleanup is called. + std::vector> to_be_deleted_; + std::vector param_instructions_; TF_DISALLOW_COPY_AND_ASSIGN(HloComputation); diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index 9f45cac028c..8aeb92b40de 100755 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -1661,7 +1661,11 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( return clone; } -HloInstruction::~HloInstruction() { +void HloInstruction::DetachFromOperandsAndUsers() { + if (cleaned_up_) { + return; + } + cleaned_up_ = true; // Detach from operands. An instruction may be repeated as an operand. To // avoid calling RemoveUser twice on the same operand, check before remove. 
for (int64 operand_num = 0; operand_num < operand_count(); ++operand_num) { diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index a108a91d5f9..33c0daca686 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -480,7 +480,11 @@ class HloInstruction { kCustom, }; - virtual ~HloInstruction(); + virtual ~HloInstruction() { DetachFromOperandsAndUsers(); } + + // Detaches an instruction from its operands and users. That is, remove the + // instruction from each operand's user set and user's operand set. + void DetachFromOperandsAndUsers(); // Creates an instruction from the given proto. Arguments: // @@ -2025,6 +2029,10 @@ class HloInstruction { // a default configuration. bool is_default_config_ = false; + // True if this instruction has already been detached from its user and + // operands. + bool cleaned_up_ = false; + // String identifier for instruction. string name_; diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h index 5e662e0bebc..f25f4694f21 100644 --- a/tensorflow/compiler/xla/service/hlo_module.h +++ b/tensorflow/compiler/xla/service/hlo_module.h @@ -184,6 +184,13 @@ class HloModule { // Gets the number of instructions in this module. int64 instruction_count() const; + // Deallocate removed instructions in each computation. + void Cleanup() { + for (auto& comp : computations_) { + comp->Cleanup(); + } + } + // Compute and return a post order of all computations in the module. The sort // is defined like so: if computation A has an instruction which calls // computation B, then A will appear after B in the sort. diff --git a/tensorflow/compiler/xla/service/hlo_module_group.h b/tensorflow/compiler/xla/service/hlo_module_group.h index c4b10f3b22a..217f65b4a75 100644 --- a/tensorflow/compiler/xla/service/hlo_module_group.h +++ b/tensorflow/compiler/xla/service/hlo_module_group.h @@ -64,6 +64,13 @@ class HloModuleGroup { string ToString() const; + // Deallocate removed instructions in each module. + void Cleanup() { + for (auto& module : modules_) { + module->Cleanup(); + } + } + // Serialize the module group to/from a proto. HloModuleGroupProto ToProto() const; static StatusOr CreateFromProto( diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.h b/tensorflow/compiler/xla/service/hlo_pass_pipeline.h index ad4070e3e23..16fad113b0d 100644 --- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.h +++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.h @@ -104,11 +104,15 @@ class HloPassPipeline : public HloPassInterface { // helpers enable templating of the core of the pipeline logic by providing // HloModule and HloModuleGroup specific methods with the same name. 
static StatusOr RunHelper(HloPassInterface* pass, HloModule* module) { - return pass->Run(module); + TF_ASSIGN_OR_RETURN(bool changed, pass->Run(module)); + module->Cleanup(); + return changed; } static StatusOr RunHelper(HloPassInterface* pass, HloModuleGroup* module_group) { - return pass->RunOnModuleGroup(module_group); + TF_ASSIGN_OR_RETURN(bool changed, pass->RunOnModuleGroup(module_group)); + module_group->Cleanup(); + return changed; } const string name_; diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.cc b/tensorflow/compiler/xla/service/multi_output_fusion.cc index a8a4b7ef872..d97893b6d04 100644 --- a/tensorflow/compiler/xla/service/multi_output_fusion.cc +++ b/tensorflow/compiler/xla/service/multi_output_fusion.cc @@ -368,12 +368,12 @@ bool MultiOutputFusion::Perform() { int changed = false; // Pick the top candidate from queue and try to merge. while (!worklist_.empty()) { - ToBeFused candidate = worklist_.top(); - worklist_.pop(); + ToBeFused candidate = worklist_.pop(); HloInstruction* instr1 = candidate.instr1; HloInstruction* instr2 = candidate.instr2; + // Candidates are already fused. if (is_fused(instr1) || is_fused(instr2)) { continue; } diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.h b/tensorflow/compiler/xla/service/multi_output_fusion.h index 18069e2f76c..f0b56eeff90 100644 --- a/tensorflow/compiler/xla/service/multi_output_fusion.h +++ b/tensorflow/compiler/xla/service/multi_output_fusion.h @@ -136,9 +136,34 @@ class MultiOutputFusion : public HloModulePass { HloInstruction* instr1; HloInstruction* instr2; int64 score; - ToBeFused(HloInstruction* instr1, HloInstruction* instr2, int64 score) - : instr1(instr1), instr2(instr2), score(score) {} - bool operator<(const ToBeFused& rhs) const { return score < rhs.score; } + int64 timestamp; + ToBeFused(HloInstruction* instr1, HloInstruction* instr2, int64 score, + int64 timestamp) + : instr1(instr1), instr2(instr2), score(score), timestamp(timestamp) {} + bool operator<(const ToBeFused& rhs) const { + return std::pair(score, timestamp) < + std::pair(rhs.score, rhs.timestamp); + } + }; + + // Stable priority queue where each insertion has a timestamp for + // deterministic popping. + class WorkList { + public: + bool empty() { return worklist_.empty(); } + ToBeFused pop() { + ToBeFused tmp = worklist_.top(); + worklist_.pop(); + return tmp; + } + template + void emplace(Args&&... args) { + worklist_.emplace(std::forward(args)..., timestamp_++); + } + + private: + std::priority_queue worklist_; + int64 timestamp_ = 0; }; // Update the internal data structures before instr1 and instr2 are fused into @@ -169,7 +194,7 @@ class MultiOutputFusion : public HloModulePass { } std::vector candidates_; - std::priority_queue worklist_; + WorkList worklist_; // A map that maps an instruction to the index_. absl::flat_hash_map candidates_index_; From ae7a428bfad4598a5ba186ea65a402166ed55004 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 18 Feb 2020 20:46:35 -0800 Subject: [PATCH 211/442] Go: Update generated wrapper functions for TensorFlow ops. 
PiperOrigin-RevId: 295885930 Change-Id: Ia5e57f85a1e62bd486fa10ac265044c72742fb0d --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index c744d5b466a..f69affe5e8a 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. 
The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21345,7 +21345,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22053,7 +22053,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22249,7 +22249,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22318,7 +22318,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22433,7 +22433,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22492,7 +22492,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22666,7 +22666,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22857,7 +22857,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25297,7 +25297,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25629,7 +25629,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25679,7 +25679,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25929,7 +25929,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26559,7 +26559,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27624,7 +27624,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45491,7 +45491,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From eebf50dd9e0659d9c144d91c4675f17fb14a79c0 Mon Sep 17 00:00:00 2001 From: Ran Chen Date: Tue, 18 Feb 2020 20:57:20 -0800 Subject: [PATCH 212/442] Make test_run_in_graph_and_eager work with test combinations PiperOrigin-RevId: 295887061 Change-Id: I83ca68a1e01ad124cc25dff071affdc8c6413b55 --- tensorflow/python/BUILD | 1 + tensorflow/python/framework/test_util.py | 2 +- tensorflow/python/framework/test_util_test.py | 6 ++++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 86a9530f337..dfed8ce0402 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -2531,6 +2531,7 @@ tf_py_test( deps = [ ":control_flow_ops", ":errors", + ":framework_combinations", ":framework_for_generated_wrappers", ":framework_test_lib", ":platform_test", diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index a225fd94100..d9daceb7314 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -1136,7 +1136,7 @@ def run_in_graph_and_eager_modes(func=None, run_eagerly(self, **kwargs) ops.dismantle_graph(graph_for_eager_test) - return decorated + return tf_decorator.make_decorator(f, decorated) if func is not None: return decorator(func) diff --git a/tensorflow/python/framework/test_util_test.py b/tensorflow/python/framework/test_util_test.py index eec7010fbdf..96f7d600713 100644 --- a/tensorflow/python/framework/test_util_test.py +++ b/tensorflow/python/framework/test_util_test.py @@ -33,6 +33,7 @@ from tensorflow.core.framework import graph_pb2 from tensorflow.core.protobuf import meta_graph_pb2 from tensorflow.python.compat import compat from tensorflow.python.eager import context +from tensorflow.python.framework import combinations from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors @@ -742,6 +743,11 @@ class TestUtilTest(test_util.TensorFlowTestCase, parameterized.TestCase): def test_run_in_graph_and_eager_works_with_parameterized_keyword(self, arg): self.assertEqual(arg, True) + @combinations.generate(combinations.combine(arg=True)) + @test_util.run_in_graph_and_eager_modes + def test_run_in_graph_and_eager_works_with_combinations(self, arg): + 
self.assertEqual(arg, True) + def test_build_as_function_and_v1_graph(self): class GraphModeAndFunctionTest(parameterized.TestCase): From 7c1bc443faeb53fcf9a11bd7b3b4ee24a46974dd Mon Sep 17 00:00:00 2001 From: Blake Hechtman Date: Tue, 18 Feb 2020 22:05:49 -0800 Subject: [PATCH 213/442] [XLA] Predicate Reduce(Dot(....)) under enable_dot_strength_reduction. PiperOrigin-RevId: 295896500 Change-Id: I07fda5d17b160f8ea1492c71dee9b6d58204d50b --- tensorflow/compiler/xla/service/algebraic_simplifier.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index 5f50c2b303b..cfbcb5a4fe2 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -3727,7 +3727,8 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* hlo) { // Convert Reduce(Dot(X,Y)) to Dot(X,Y) if any of the dimensions reduced were // batch dimensions of the dot. The transformation supports reducing other // dimensions as well. - if (Match(arg, m::Dot(&dot, m::Op(&lhs), m::Op(&rhs)).WithOneUser()) && + if (options_.enable_dot_strength_reduction() && + Match(arg, m::Dot(&dot, m::Op(&lhs), m::Op(&rhs)).WithOneUser()) && Match(reduce->to_apply()->root_instruction(), m::Add(m::Parameter(), m::Parameter())) && absl::c_any_of(reduce->dimensions(), [&](int64 dim) { From b5ac5db07c16f7e3a59967591a2aae7669b0ef2d Mon Sep 17 00:00:00 2001 From: Alexander Belyaev Date: Tue, 18 Feb 2020 22:22:57 -0800 Subject: [PATCH 214/442] Add xla_lhlo::DynamicBroadcastInDimOp -> Linalg lowering. Currently, it does not support extending a dimension of size 1 to N. PiperOrigin-RevId: 295898134 Change-Id: I62e17e4948873a1c7ce35484ade0aec10bdb244f --- .../xla/tests/lhlo-legalize-to-linalg.mlir | 16 ++++++++++++++++ .../xla/transforms/xla_legalize_to_linalg.cc | 18 ++++++++++-------- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir index 19e16ceab44..78f0d9ffb18 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir @@ -179,6 +179,22 @@ func @iota(%out: memref<7x10xi64>) { // ----- +// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d4, d0, d2)> +// CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)> +// CHECK-LABEL: func @dynamic_broadcast +func @dynamic_broadcast(%operand: memref, + %result: memref) { + "xla_lhlo.broadcast_in_dim"(%operand, %result) + {broadcast_dimensions = dense<[4,0,2]> : tensor<3xi64>} + : (memref, memref) -> () + return +} +// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] +// CHECK-NEXT: ^bb0(%[[OPERAND:.*]]: f32, %[[RESULT:.*]]: f32): +// CHECK-NEXT: linalg.yield %[[OPERAND]] : f32 + +// ----- + // CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d4, d0, 0)> // CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)> // CHECK-LABEL: func @broadcast diff --git a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc index b6019b1e263..d07819284e5 100644 --- a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc +++ 
b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc @@ -227,19 +227,21 @@ class BroadcastInDimConverter unsigned nloops = resultMemrefType.getRank(); + auto operandShape = operandMemrefType.getShape(); SmallVector dimExprs; { dimExprs.reserve(nloops); + for (const auto& broadcastDim : llvm::enumerate( + broadcastOp.broadcast_dimensions().getValue().getIntValues())) { + int dim = broadcastDim.value().getSExtValue(); - auto operandShape = operandMemrefType.getShape(); - int index = 0; - for (const auto& broadcastSize : - broadcastOp.broadcast_dimensions().getValue().getIntValues()) { - int size = broadcastSize.getSExtValue(); - dimExprs.push_back( - operandShape[index++] == 1 + // TODO(pifon): Add support for args with dynamic shapes for the case + // when a dimension of size 1 is broadcasted into dim of size N. + AffineExpr affineExpr = + operandShape[broadcastDim.index()] == 1 ? mlir::getAffineConstantExpr(0, broadcastOp.getContext()) - : mlir::getAffineDimExpr(size, broadcastOp.getContext())); + : mlir::getAffineDimExpr(dim, broadcastOp.getContext()); + dimExprs.push_back(affineExpr); } } From f738cd59c39d8ba92b91f35ad0be5d8005216292 Mon Sep 17 00:00:00 2001 From: Haoyu Wu Date: Tue, 18 Feb 2020 22:55:55 -0800 Subject: [PATCH 215/442] Add leaky_relu operator property --- tensorflow/lite/tools/optimize/operator_property.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/lite/tools/optimize/operator_property.cc b/tensorflow/lite/tools/optimize/operator_property.cc index 13f63092761..d6a42867230 100644 --- a/tensorflow/lite/tools/optimize/operator_property.cc +++ b/tensorflow/lite/tools/optimize/operator_property.cc @@ -792,6 +792,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.outputs = {{0, {}}}; property.version = 2; break; + case BuiltinOperator_LEAKY_RELU: case BuiltinOperator_RELU: case BuiltinOperator_RELU6: property.inputs = {{0, {}}}; From 6ef3d651d733c8a2c7bff06d61abfa5bb96e4fe5 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 18 Feb 2020 22:53:40 -0800 Subject: [PATCH 216/442] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 295900926 Change-Id: Ic07de0ba74049020a12344e9c9cd85d91afaba49 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index f69affe5e8a..c744d5b466a 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. 
-// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21345,7 +21345,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22053,7 +22053,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22249,7 +22249,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22318,7 +22318,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22433,7 +22433,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22492,7 +22492,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22666,7 +22666,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22857,7 +22857,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25297,7 +25297,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25629,7 +25629,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25679,7 +25679,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25929,7 +25929,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26559,7 +26559,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27624,7 +27624,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45491,7 +45491,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From e623eb0f9c1c65705f0cfb1c6cb1d8cb2649cdbb Mon Sep 17 00:00:00 2001 From: Ran Chen Date: Tue, 18 Feb 2020 23:15:38 -0800 Subject: [PATCH 217/442] Fix all_reduce of IndexedSlices when there're multiple devices The tf.cond needs to happen on every device, instead of just one. 
Otherwise, there's no dependency on the gathering of the lengths on non-first devices. This change also adds control dependencies between gathering values and gathering indices, to make sure they're launched in a correct order. github#33339 PiperOrigin-RevId: 295903094 Change-Id: I0a2c984d0ee5230b7bc7cf3ae513a69e0d32a56e --- tensorflow/python/distribute/BUILD | 3 +- .../python/distribute/cross_device_ops.py | 94 +------ .../distribute/cross_device_ops_test.py | 4 + .../python/distribute/cross_device_utils.py | 229 ++++++++++++++---- 4 files changed, 205 insertions(+), 125 deletions(-) diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 8ba5813cf16..e201cfa6dbb 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -789,8 +789,7 @@ cuda_py_test( name = "cross_device_ops_test", srcs = ["cross_device_ops_test.py"], tags = [ - # TODO(b/138143527): Re-enable after fixing Guitar failure. - # "multi_and_single_gpu", + "multi_and_single_gpu", ], deps = [ ":collective_all_reduce_strategy", diff --git a/tensorflow/python/distribute/cross_device_ops.py b/tensorflow/python/distribute/cross_device_ops.py index 7f6230e9404..3b5dff9a6f8 100644 --- a/tensorflow/python/distribute/cross_device_ops.py +++ b/tensorflow/python/distribute/cross_device_ops.py @@ -34,7 +34,6 @@ from tensorflow.python.framework import kernels from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops -from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.platform import tf_logging as logging @@ -1151,7 +1150,7 @@ class CollectiveAllReduce(CrossDeviceOps): reduced_gv_list): control_input_grads = [g for g, _ in reduced_gv_list[-1]] else: - control_input_grads = [] + control_input_grads = None collective_reduced = cross_device_utils.build_collective_reduce( grads, self._num_workers, self._collective_keys, "Add", "Id", communication_hint, control_input_grads) @@ -1200,87 +1199,20 @@ class CollectiveAllReduce(CrossDeviceOps): # optimizer and packed into a single all-reduce. with ops.name_scope("allreduce"): for grad_and_vars in chunk: - # `grad_and_vars` contains gradients for the same variable but from - # different devices. Because current CollectiveAllGather - # implementations require input IndexedSlices to have consistent - # length across the board, we handle the reduction of IndexedSlices - # as follows: - # 1. Gather the lengths of IndexedSlices from all participants. - # 2. If they have consistent length, apply all_gather. - # 3. Otherwise convert IndexedSlices to dense tensors and apply - # all_reduce. + grads = [g for g, _ in grad_and_vars] - def all_gather(): - """Use all_gather to aggregate `IndexedSlices`.""" - grads = [g for g, _ in grad_and_vars] # pylint: disable=cell-var-from-loop - values = [g.values for g in grads] - indices = [g.indices for g in grads] - - # Build two separate allgathers, one for values, the other one for - # indices. 
- gathered_values = cross_device_utils.build_collective_gather( - values, self._num_workers, self._collective_keys) - gathered_indices = cross_device_utils.build_collective_gather( - indices, self._num_workers, self._collective_keys) - assert len(gathered_values) == len(gathered_indices) - - gathered_grads = [] - for i in range(len(values)): - gathered_grad = ops.IndexedSlices( - values=gathered_values[i], - indices=gathered_indices[i], - dense_shape=grads[i].dense_shape) - gathered_grads.append(gathered_grad) - return gathered_grads - - def all_reduce(): - """Use all_reduce to aggregate `IndexedSlices`.""" - grads = [] - for g, _ in grad_and_vars: # pylint: disable=cell-var-from-loop - with ops.device(g.device): - grads.append(ops.convert_to_tensor(g)) - - reduced_dense_grads = cross_device_utils.build_collective_reduce( - grads, self._num_workers, self._collective_keys, "Add", "Id", - communication_hint) - # We have to convert dense grad to IndexedSlice because all_reduce() - # and all_gather() must have the same return type as required by - # control_flow_ops.cond. - reduced_grads = [] - for grad in reduced_dense_grads: - reduced_grads.append( - ops.IndexedSlices( - values=grad, - indices=math_ops.range(array_ops.shape(grad)[0]), - dense_shape=array_ops.shape(grad))) - return reduced_grads - - indexed_slice_lengths = [] - for g, _ in grad_and_vars: - with ops.device(g.device): - indexed_slice_lengths.append(array_ops.shape(g.indices)) - gathered_indexed_slice_lengths = ( - cross_device_utils.build_collective_gather( - indexed_slice_lengths, self._num_workers, - self._collective_keys)) - # gathered_indexed_slice_lengths takes the following forms: - # [[length1_on_gpu_0, length2_on_gpu0, ...], - # [length1_on_gpu_1, length2_on_gpu1, ...] - # ... - # ] - # Each sublist is value-wise identical but resides on different - # devices. Since each sublist has the same value, we can just use the - # first sublist to compute the condition. - collective_reduced = control_flow_ops.cond( - math_ops.equal( - math_ops.reduce_max(gathered_indexed_slice_lengths[0]), - math_ops.reduce_min(gathered_indexed_slice_lengths[0])), - all_gather, all_reduce) - # tf.cond implicitly unpacks singleton list to single value, hence - # we need to re-wrap the single value into a singleton list here. - if not isinstance(collective_reduced, list): - collective_reduced = [collective_reduced] + # Add control dependencies per device from the last gradients to the + # current set, in order to serialize NCCL launches. 
+ if (communication_hint == CollectiveCommunication.NCCL.value and + reduced_gv_list): + control_input_grads = [g for g, _ in reduced_gv_list[-1]] + else: + control_input_grads = None + collective_reduced = ( + cross_device_utils.build_collective_gather_indexed_slices( + grads, self._num_workers, self._collective_keys, + communication_hint, control_input_grads)) result = [] for (_, v), g in zip(grad_and_vars, collective_reduced): result.append([g, v]) diff --git a/tensorflow/python/distribute/cross_device_ops_test.py b/tensorflow/python/distribute/cross_device_ops_test.py index 17be5de236e..fe42f42ce2e 100644 --- a/tensorflow/python/distribute/cross_device_ops_test.py +++ b/tensorflow/python/distribute/cross_device_ops_test.py @@ -723,6 +723,8 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, num_packs=[1, 2])) def testReductionDistributed(self, required_gpus, use_strategy_object, num_packs): + if required_gpus == 2: + self.skipTest("b/138143527") self._run_between_graph_clients( self._test_reduction, self._cluster_spec, @@ -749,6 +751,8 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, required_gpus=2, use_strategy_object=[True, False])) def testReductionLocal(self, required_gpus, use_strategy_object): + if required_gpus == 2: + self.skipTest("b/138143527") self._test_reduction( None, None, diff --git a/tensorflow/python/distribute/cross_device_utils.py b/tensorflow/python/distribute/cross_device_utils.py index 3afb8b55b24..0b88bdc9067 100644 --- a/tensorflow/python/distribute/cross_device_utils.py +++ b/tensorflow/python/distribute/cross_device_utils.py @@ -25,12 +25,12 @@ from tensorflow.python.distribute import all_reduce from tensorflow.python.distribute import values as value_lib from tensorflow.python.eager import backprop from tensorflow.python.eager import context -from tensorflow.python.eager import def_function from tensorflow.python.framework import device as pydev from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import collective_ops +from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nccl_ops @@ -304,6 +304,19 @@ class CollectiveKeys(object): self._group_key_table[key_id] = new_key return self._group_key_table[key_id] + def get_group_key_of_tensors(self, tensors): + """Returns a group key for set of tensors. + + Args: + tensors: list of `Tensor`s in a collective group. Each tensor must be on a + different device. + + Returns: + int key uniquely identifying the set of devices of these tensors. + """ + devices = [t.device for t in tensors] + return self.get_group_key(devices) + def get_op_instance_key(self): """Returns a new instance key for use in defining a collective op.""" v = self._get_thread_local_object().op_instance_key @@ -322,10 +335,12 @@ def build_collective_reduce(input_tensors, collective_keys, reduction_op='Add', unary_op='Id', - communication_hint='auto', + communication_hint='AUTO', control_inputs=None): """Build a subgraph that does one full all-reduce, using the collective Op. + This method must be called in graph mode or inside a tf.function. + Args: input_tensors: tensors within a single worker graph that are to be reduced together; must be one per device. @@ -346,37 +361,40 @@ def build_collective_reduce(input_tensors, Raises: ValueError: There must be at least two tensors over all the workers. 
""" + assert not context.executing_eagerly(), ( + 'build_collective_reduce can only be called in graph mode or inside ' + 'tf.function') + group_size = len(input_tensors) * num_workers if group_size < 2: return input_tensors - devices = [t.device for t in input_tensors] - num_devices = len(devices) - group_key = collective_keys.get_group_key(devices) + group_key = collective_keys.get_group_key_of_tensors(input_tensors) instance_key = collective_keys.get_op_instance_key() subdiv_offsets = [0] # TODO(tucker): maybe support non-default subdiv spec - if control_inputs: - assert len(control_inputs) == len(input_tensors) out_tensors = [] - for dev_idx in range(num_devices): - with ops.device(devices[dev_idx]): - if control_inputs: - assert control_inputs[dev_idx].device == input_tensors[dev_idx].device - with ops.control_dependencies([control_inputs[dev_idx]]): - reduce_op = collective_ops.all_reduce( - input_tensors[dev_idx], group_size, group_key, instance_key, - reduction_op, unary_op, subdiv_offsets, communication_hint) - else: - reduce_op = collective_ops.all_reduce( - input_tensors[dev_idx], group_size, group_key, instance_key, - reduction_op, unary_op, subdiv_offsets, communication_hint) - out_tensors.append(reduce_op) + for idx, input_tensor in enumerate(input_tensors): + with ops.device(input_tensor.device): + with ops.control_dependencies( + _control_input(input_tensors, control_inputs, idx)): + out_tensor = collective_ops.all_reduce(input_tensor, group_size, + group_key, instance_key, + reduction_op, unary_op, + subdiv_offsets, + communication_hint) + out_tensors.append(out_tensor) return out_tensors -def build_collective_gather(input_tensors, num_workers, collective_keys): +def build_collective_gather(input_tensors, + num_workers, + collective_keys, + communication_hint='AUTO', + control_inputs=None): """Build a subgraph that does one full all-gather, using the collective Op. + This method must be called in graph mode or inside a tf.function. + Args: input_tensors: tensors within a single worker graph that are to be gathered together; must be one per device. @@ -384,37 +402,136 @@ def build_collective_gather(input_tensors, num_workers, collective_keys): will be doing this same reduction. The reduction will actually include the corresponding tensors at all these workers. collective_keys: a CollectiveKeys object. + communication_hint: string providing hint to runtime for choosing collective + implementation. + control_inputs: if not None, add control edges between control_inputs and + (index-wise) corresponding collective_gather tensors Returns: An array of final tensors, one per device, computed by the full gather. - - Raises: - ValueError: There must be at least two tensors over all the workers. 
""" + assert not context.executing_eagerly(), ( + 'build_collective_gather can only be called in graph mode or inside ' + 'tf.function') + group_size = len(input_tensors) * num_workers if group_size < 2: return input_tensors - devices = [t.device for t in input_tensors] - num_devices = len(devices) - group_key = collective_keys.get_group_key(devices) + group_key = collective_keys.get_group_key_of_tensors(input_tensors) instance_key = collective_keys.get_op_instance_key() - def collective_all_gather(): - """Call collective allgather.""" - assert not context.executing_eagerly() - out_tensors = [] - for d in range(num_devices): - with ops.device(devices[d]): - gather_op = collective_ops.all_gather(input_tensors[d], group_size, - group_key, instance_key) - out_tensors.append(gather_op) - return out_tensors + out_tensors = [] + for idx, input_tensor in enumerate(input_tensors): + with ops.device(input_tensor.device): + with ops.control_dependencies( + _control_input(input_tensors, control_inputs, idx)): + out_tensor = collective_ops.all_gather(input_tensor, group_size, + group_key, instance_key, + communication_hint) + out_tensors.append(out_tensor) + return out_tensors - if context.executing_eagerly(): - # Collective ops will block unless they are executed concurrently such as in - # a graph or a defun. - collective_all_gather = def_function.function(collective_all_gather) - return collective_all_gather() + +def build_collective_gather_indexed_slices(input_slices_list, + num_workers, + collective_keys, + communication_hint='AUTO', + control_inputs=None): + """Build a subgraph that all-gathers IndexedSlices using the collective Op. + + This method must be called in graph mode or inside a tf.function. + + Args: + input_slices_list: a list of IndexedSlices within a single worker graph that + are to be gathered together; must be one per device. + num_workers: total number of workers with identical independent graphs that + will be doing this same reduction. The reduction will actually include + the corresponding tensors at all these workers. + collective_keys: a CollectiveKeys object. + communication_hint: string providing hint to runtime for choosing collective + implementation. + control_inputs: if not None, add control edges between control_inputs and + (index-wise) corresponding collective_reduce tensors + + Returns: + An array of final IndexedSlices, one per device, computed by the full + gather. + + Raises: + ValueError: if control_inputs is not None and doesn't match the length and + devices of inputs. + """ + assert not context.executing_eagerly(), ( + 'build_collective_gather_indexed_slices can only be called in graph mode' + ' or inside tf.function') + + group_size = len(input_slices_list) * num_workers + if group_size < 2: + return input_slices_list + + group_key = collective_keys.get_group_key_of_tensors(input_slices_list) + gather_length_key = collective_keys.get_op_instance_key() + gather_indices_key = collective_keys.get_op_instance_key() + gather_values_key = collective_keys.get_op_instance_key() + reduce_densified_key = collective_keys.get_op_instance_key() + + # Current CollectiveAllGather implementations require input IndexedSlices to + # have consistent length across the board, we handle the reduction of + # IndexedSlices as follows: + # 1. Gather the lengths of IndexedSlices from all participants. + # 2. If they have consistent length, apply all_gather. + # 3. Otherwise convert IndexedSlices to dense tensors and apply + # all_reduce. 
+ out_slices_list = [] + for idx, input_slices in enumerate(input_slices_list): + # pylint: disable = cell-var-from-loop + with ops.device(input_slices.device): + + def all_gather(): + """Use all_gather to aggregate `IndexedSlices`.""" + all_values = collective_ops.all_gather(input_slices.values, group_size, + group_key, gather_values_key, + communication_hint) + # Add control dependency to order the all-gather. + control = [all_values] if communication_hint == 'NCCL' else [] + with ops.control_dependencies(control): + all_indices = collective_ops.all_gather(input_slices.indices, + group_size, group_key, + gather_indices_key, + communication_hint) + return ops.IndexedSlices( + values=all_values, + indices=all_indices, + dense_shape=input_slices.dense_shape) + + def densify_and_all_reduce(): + """Use all_reduce to aggregate `IndexedSlices`.""" + densified = ops.convert_to_tensor(input_slices) + reduced = collective_ops.all_reduce(densified, group_size, group_key, + reduce_densified_key, 'Add', 'Id', + [0], communication_hint) + # We have to convert dense grad to IndexedSlice because all_reduce() + # and all_gather() must have the same return type as required by + # control_flow_ops.cond. + return ops.IndexedSlices( + values=reduced, + indices=math_ops.range(array_ops.shape(reduced)[0]), + dense_shape=input_slices.dense_shape) + + length = array_ops.shape(input_slices.indices) + with ops.control_dependencies( + _control_input(input_slices, control_inputs, idx)): + all_lengths = collective_ops.all_gather(length, group_size, group_key, + gather_length_key, + communication_hint) + out_slices = control_flow_ops.cond( + math_ops.equal( + math_ops.reduce_max(all_lengths), + math_ops.reduce_min(all_lengths)), all_gather, + densify_and_all_reduce) + out_slices_list.append(out_slices) + # pylint: enable=cell-var-from-loop + return out_slices_list def sum_grad_and_var_all_reduce(grad_and_vars, @@ -777,3 +894,31 @@ def stitch_values(values_and_indices_list): assert result[i] is None result[i] = v return result + + +def _control_input(inputs, control_inputs, idx): + """Returns the `idx`-th item in control_inputs to be used in ops.control_dependencies. + + This is a helper function for building collective ops. The function checks + that the devices of control_inputs and inputs match. + + Args: + inputs: a list of `Tensor`s + control_inputs: a list or None. + idx: the index into `inputs` and `control_inputs`. + + Returns: + A one item list of the `idx`-th element of `control_inputs`, or an empty + list if `control_inputs` is None. + """ + if control_inputs is None: + return [] + if len(control_inputs) != len(inputs): + raise ValueError( + 'control_inputs must match the length of the inputs, %s != %s' % + (len(control_inputs), len(inputs))) + if control_inputs[idx].device != inputs[idx].device: + raise ValueError( + 'control_inputs must match the device of the inputs, %s != %s' % + (control_inputs[idx].device, inputs[idx].device)) + return control_inputs[idx] From cafd3318ed414183526c2a484e5350cedef837a7 Mon Sep 17 00:00:00 2001 From: Ran Chen Date: Tue, 18 Feb 2020 23:46:29 -0800 Subject: [PATCH 218/442] Fix cross_device_ops_test with multiple GPUs Collective ops needs to be launched on every device, so we can't evaluate values on each replica separately. 
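A minimal illustrative sketch of this constraint (not taken from this patch; it assumes two GPUs in one process and uses placeholder group/instance keys, mirroring the positional `collective_ops.all_reduce` call used in `build_collective_reduce` above):

```python
# Hedged sketch: every member of a collective group must be launched together,
# so all per-device results have to be fetched in a single Session.run call.
import tensorflow.compat.v1 as tf
from tensorflow.python.ops import collective_ops

tf.disable_eager_execution()

devices = ["/device:GPU:0", "/device:GPU:1"]  # assumes two local GPUs
group_size, group_key, instance_key = len(devices), 1, 1  # placeholder keys

reduced = []
for i, dev in enumerate(devices):
  with tf.device(dev):
    tensor = tf.constant([float(i)])
    reduced.append(
        collective_ops.all_reduce(tensor, group_size, group_key, instance_key,
                                  "Add", "Id", [0]))

with tf.Session() as sess:
  # One run launches the group member on every device, so the collective can
  # complete. Fetching sess.run(reduced[0]) alone would hang, because the
  # GPU:1 member of the group is never launched.
  print(sess.run(reduced))
```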
PiperOrigin-RevId: 295905955 Change-Id: Ie326a1be8574ac1b6299ea05756a73cdd6d25904 --- .../distribute/cross_device_ops_test.py | 278 ++++++++---------- 1 file changed, 130 insertions(+), 148 deletions(-) diff --git a/tensorflow/python/distribute/cross_device_ops_test.py b/tensorflow/python/distribute/cross_device_ops_test.py index fe42f42ce2e..b60809fd3b5 100644 --- a/tensorflow/python/distribute/cross_device_ops_test.py +++ b/tensorflow/python/distribute/cross_device_ops_test.py @@ -111,143 +111,152 @@ class CrossDeviceOpsTestBase(test.TestCase, parameterized.TestCase): def _assert_indexed_slices_equal(self, left, right): self.assertIsInstance(left, ops.IndexedSlices) self.assertIsInstance(right, ops.IndexedSlices) - self.assertEqual(device_util.resolve(left.device), - device_util.resolve(right.device)) + self.assertEqual( + device_util.resolve(left.device), device_util.resolve(right.device)) self.assertAllEqual( self.evaluate(ops.convert_to_tensor(left)), self.evaluate(ops.convert_to_tensor(right))) - def _assert_values_equal(self, left, right): - self.assertEqual(type(left), type(right)) - if isinstance(left, (list, tuple)): - for l, r in zip(left, right): - self._assert_values_equal(l, r) - else: - if isinstance(left, value_lib.DistributedValues): - self.assertEqual(set(left._devices), set(right._devices)) - self._assert_values_equal(left.values, right.values) + def _assert_mirrored_equal(self, left_list, right_list, sess): + if not isinstance(left_list, list): + left_list, right_list = [left_list], [right_list] + + for left, right in zip(left_list, right_list): + self.assertEqual(type(left), type(right)) + + # Convert Mirrored to a list since sess.run(Mirrored) only returns one + # value. + if isinstance(left, value_lib.Mirrored): + left, right = left.values, right.values else: - self.assertEqual( - device_util.resolve(left.device), device_util.resolve(right.device)) - if isinstance(left, ops.IndexedSlices): - self._assert_indexed_slices_equal(left, right) - elif context.executing_eagerly(): - self.assertEqual(left.numpy(), right.numpy()) - else: - with self.cached_session() as sess: - self.assertEqual(sess.run(left), sess.run(right)) + # When there's only one replica Mirrored is automatically unwrapped. + left, right = [left], [right] + + for left_value, right_value in zip(left, right): + self.assertEqual(left_value.device, right_value.device) + + # Densify IndexedSlices. + left = [ops.convert_to_tensor(v) for v in left] + right = [ops.convert_to_tensor(v) for v in right] + left, right = sess.run((left, right)) + for left_value, right_value in zip(left, right): + self.assertAllEqual(left_value, right_value) def _testReductionAndBroadcast(self, cross_device_ops, devices): if context.num_gpus() < sum(1 for d in devices if "GPU" in d.upper()): self.skipTest("Not enough GPUs") - values = [constant_op.constant(float(d)) for d in range(len(devices))] - per_replica = _make_per_replica(values, devices) - mean = (len(devices) - 1.) / 2. + with self.cached_session() as sess: + values = [constant_op.constant(float(d)) for d in range(len(devices))] + per_replica = _make_per_replica(values, devices) + mean = (len(devices) - 1.) / 2. - values_2 = [constant_op.constant(d + 1.0) for d in range(len(devices))] - per_replica_2 = _make_per_replica(values_2, devices) - mean_2 = mean + 1. + values_2 = [constant_op.constant(d + 1.0) for d in range(len(devices))] + per_replica_2 = _make_per_replica(values_2, devices) + mean_2 = mean + 1. 
- destination_mirrored = _fake_mirrored(1., devices) - destination_different = _fake_mirrored(1., device_util.resolve(_cpu_device)) - destination_str = device_util.resolve(_cpu_device) + destination_mirrored = _fake_mirrored(1., devices) + destination_different = _fake_mirrored(1., + device_util.resolve(_cpu_device)) + destination_str = device_util.resolve(_cpu_device) - all_destinations = [ - destination_mirrored, destination_different, destination_str, - ] + all_destinations = [ + destination_mirrored, + destination_different, + destination_str, + ] - # test reduce() - for destinations in all_destinations: - self._assert_values_equal( - cross_device_ops.reduce( - reduce_util.ReduceOp.MEAN, - per_replica, - destinations=destinations), - _fake_mirrored(mean, destinations)) - self._assert_values_equal( - cross_device_ops.reduce( - reduce_util.ReduceOp.MEAN, - per_replica_2, - destinations=destinations), - _fake_mirrored(mean_2, destinations)) - self._assert_values_equal( - cross_device_ops.reduce( - reduce_util.ReduceOp.SUM, per_replica, - destinations=destinations), - _fake_mirrored(mean * len(devices), destinations)) - self._assert_values_equal( - cross_device_ops.reduce( - reduce_util.ReduceOp.SUM, - per_replica_2, - destinations=destinations), - _fake_mirrored(mean_2 * len(devices), destinations)) + # test reduce() + for destinations in all_destinations: + self._assert_mirrored_equal( + cross_device_ops.reduce( + reduce_util.ReduceOp.MEAN, + per_replica, + destinations=destinations), _fake_mirrored(mean, destinations), + sess) + self._assert_mirrored_equal( + cross_device_ops.reduce( + reduce_util.ReduceOp.MEAN, + per_replica_2, + destinations=destinations), + _fake_mirrored(mean_2, destinations), sess) + self._assert_mirrored_equal( + cross_device_ops.reduce( + reduce_util.ReduceOp.SUM, + per_replica, + destinations=destinations), + _fake_mirrored(mean * len(devices), destinations), sess) + self._assert_mirrored_equal( + cross_device_ops.reduce( + reduce_util.ReduceOp.SUM, + per_replica_2, + destinations=destinations), + _fake_mirrored(mean_2 * len(devices), destinations), sess) - # test batch_reduce() - for d1, d2 in itertools.product(all_destinations, all_destinations): - self._assert_values_equal( - cross_device_ops.batch_reduce( - reduce_util.ReduceOp.MEAN, - [(per_replica, d1), (per_replica_2, d2)]), - [ - _fake_mirrored(mean, d1), - _fake_mirrored(mean_2, d2) - ]) - self._assert_values_equal( - cross_device_ops.batch_reduce( - reduce_util.ReduceOp.SUM, - [(per_replica, d1), (per_replica_2, d2)]), - [ - _fake_mirrored(mean * len(devices), d1), - _fake_mirrored(mean_2 * len(devices), d2) - ]) + # test batch_reduce() + for d1, d2 in itertools.product(all_destinations, all_destinations): + self._assert_mirrored_equal( + cross_device_ops.batch_reduce(reduce_util.ReduceOp.MEAN, + [(per_replica, d1), + (per_replica_2, d2)]), + [_fake_mirrored(mean, d1), + _fake_mirrored(mean_2, d2)], sess) + self._assert_mirrored_equal( + cross_device_ops.batch_reduce(reduce_util.ReduceOp.SUM, + [(per_replica, d1), + (per_replica_2, d2)]), + [ + _fake_mirrored(mean * len(devices), d1), + _fake_mirrored(mean_2 * len(devices), d2) + ], sess) - # test broadcast() - for destinations in all_destinations: - self._assert_values_equal( - cross_device_ops.broadcast(constant_op.constant(1.), destinations), - _fake_mirrored(1., destinations)) + # test broadcast() + for destinations in all_destinations: + self._assert_mirrored_equal( + cross_device_ops.broadcast(constant_op.constant(1.), destinations), + 
_fake_mirrored(1., destinations), sess) def _testIndexedSlicesAllReduce(self, devices, cross_device_ops_instance, reduce_op, batch_reduce): - dense_shape = [5, 2] - t0 = _make_indexed_slices([[1., 2.]], [1], dense_shape, devices[0]) - t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], dense_shape, - devices[1]) - per_replica = value_lib.PerReplica((t0, t1)) + with self.cached_session() as sess: + dense_shape = [5, 2] + t0 = _make_indexed_slices([[1., 2.]], [1], dense_shape, devices[0]) + t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], dense_shape, + devices[1]) + per_replica = value_lib.PerReplica((t0, t1)) - if batch_reduce: - result = cross_device_ops_instance.batch_reduce( - reduce_op, [(per_replica, per_replica)]) - else: - result = cross_device_ops_instance.reduce(reduce_op, per_replica, - per_replica) + if batch_reduce: + result = cross_device_ops_instance.batch_reduce( + reduce_op, [(per_replica, per_replica)]) + else: + result = cross_device_ops_instance.reduce(reduce_op, per_replica, + per_replica) - total_indices_with_dups = [1, 1, 3] - total_indices_without_dups = [1, 3] + total_indices_with_dups = [1, 1, 3] + total_indices_without_dups = [1, 3] - if reduce_op == reduce_util.ReduceOp.SUM: - total_values_with_dups = [[1., 2.], [3., 4.], [5., 6.]] - total_values_without_dups = [[4., 6.], [5., 6.]] - else: - assert reduce_op == reduce_util.ReduceOp.MEAN - total_values_with_dups = [[0.5, 1.], [1.5, 2.], [2.5, 3.]] - total_values_without_dups = [[2., 3.], [2.5, 3.]] + if reduce_op == reduce_util.ReduceOp.SUM: + total_values_with_dups = [[1., 2.], [3., 4.], [5., 6.]] + total_values_without_dups = [[4., 6.], [5., 6.]] + else: + assert reduce_op == reduce_util.ReduceOp.MEAN + total_values_with_dups = [[0.5, 1.], [1.5, 2.], [2.5, 3.]] + total_values_without_dups = [[2., 3.], [2.5, 3.]] - total_mirrored_with_dups = _make_mirrored_indexed_slices( - devices, total_values_with_dups, total_indices_with_dups, dense_shape) - total_mirrored_without_dups = _make_mirrored_indexed_slices( - devices, total_values_without_dups, total_indices_without_dups, - dense_shape) + total_mirrored_with_dups = _make_mirrored_indexed_slices( + devices, total_values_with_dups, total_indices_with_dups, dense_shape) + total_mirrored_without_dups = _make_mirrored_indexed_slices( + devices, total_values_without_dups, total_indices_without_dups, + dense_shape) - # Test that the result is semantically equal to both the concatenated - # IndexedSlices, as well as when the duplicate indices are summed up. - if batch_reduce: - total_mirrored_with_dups = [total_mirrored_with_dups] - total_mirrored_without_dups = [total_mirrored_without_dups] + # Test that the result is semantically equal to both the concatenated + # IndexedSlices, as well as when the duplicate indices are summed up. 
+ if batch_reduce: + total_mirrored_with_dups = [total_mirrored_with_dups] + total_mirrored_without_dups = [total_mirrored_without_dups] - self._assert_values_equal(total_mirrored_with_dups, result) - self._assert_values_equal(total_mirrored_without_dups, result) + self._assert_mirrored_equal(total_mirrored_with_dups, result, sess) + self._assert_mirrored_equal(total_mirrored_without_dups, result, sess) class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase): @@ -434,7 +443,7 @@ NUM_WORKERS = 3 class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, - parameterized.TestCase): + CrossDeviceOpsTestBase): collective_key_base = 100000 @@ -505,29 +514,6 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, return (collective_all_reduce_ops, devices, "grpc://" + self._cluster_spec[task_type][task_id]) - def _assert_values_equal(self, left, right, sess): - self.assertEqual(type(left), type(right)) - if isinstance(left, (list, tuple)): - for l, r in zip(left, right): - self._assert_values_equal(l, r, sess) - else: - if isinstance(left, value_lib.DistributedValues): - self.assertEqual(set(left._devices), set(right._devices)) - self._assert_values_equal(left.values, right.values, sess) - else: - self.assertEqual( - device_util.resolve(left.device), device_util.resolve(right.device)) - if isinstance(left, ops.IndexedSlices): - self._assert_indexed_slices_equal(left, right) - elif context.executing_eagerly(): - self.assertEqual(left.numpy(), right.numpy()) - else: - run_options = config_pb2.RunOptions() - run_options.experimental.collective_graph_key = 6 - self.assertEqual( - sess.run(left, options=run_options), - sess.run(right, options=run_options)) - def _test_reduction(self, task_type, task_id, @@ -589,21 +575,21 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, # test reduce() for destinations in all_destinations: - self._assert_values_equal( + self._assert_mirrored_equal( _reduce( collective_all_reduce, reduce_util.ReduceOp.MEAN, per_replica, destinations=destinations), _fake_mirrored(mean, destinations), sess) - self._assert_values_equal( + self._assert_mirrored_equal( _reduce( collective_all_reduce, reduce_util.ReduceOp.MEAN, per_replica_2, - destinations=destinations), _fake_mirrored( - mean_2, destinations), sess) - self._assert_values_equal( + destinations=destinations), + _fake_mirrored(mean_2, destinations), sess) + self._assert_mirrored_equal( _reduce( collective_all_reduce, reduce_util.ReduceOp.SUM, @@ -611,7 +597,7 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, destinations=destinations), _fake_mirrored(mean * len(devices) * num_workers, destinations), sess) - self._assert_values_equal( + self._assert_mirrored_equal( _reduce( collective_all_reduce, reduce_util.ReduceOp.SUM, @@ -622,12 +608,12 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, # test batch_reduce() for d1, d2 in itertools.product(all_destinations, all_destinations): - self._assert_values_equal( + self._assert_mirrored_equal( _batch_reduce(collective_all_reduce, reduce_util.ReduceOp.MEAN, [(per_replica, d1), (per_replica_2, d2)]), [_fake_mirrored(mean, d1), _fake_mirrored(mean_2, d2)], sess) - self._assert_values_equal( + self._assert_mirrored_equal( _batch_reduce(collective_all_reduce, reduce_util.ReduceOp.SUM, [(per_replica, d1), (per_replica_2, d2)]), [ @@ -723,8 +709,6 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, num_packs=[1, 2])) def 
testReductionDistributed(self, required_gpus, use_strategy_object, num_packs): - if required_gpus == 2: - self.skipTest("b/138143527") self._run_between_graph_clients( self._test_reduction, self._cluster_spec, @@ -751,8 +735,6 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, required_gpus=2, use_strategy_object=[True, False])) def testReductionLocal(self, required_gpus, use_strategy_object): - if required_gpus == 2: - self.skipTest("b/138143527") self._test_reduction( None, None, From 8a97955b8402d38aedc41bfa9d4a53622f9b276a Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Tue, 18 Feb 2020 23:57:55 -0800 Subject: [PATCH 219/442] [TF saved_model_cli AOT] Move xla_compiled_cpu_function header deps to the right place. PiperOrigin-RevId: 295906983 Change-Id: I46b7480c9ac809940bd99a41bf124fde6f2ba3af --- tensorflow/tools/pip_package/BUILD | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index c50dea89482..062cff07f2a 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -27,6 +27,7 @@ transitive_hdrs( name = "included_headers", deps = [ "//tensorflow/c/experimental:network", + "//tensorflow/compiler/tf2xla:xla_compiled_cpu_function", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -126,9 +127,7 @@ COMMON_PIP_DEPS = [ "//tensorflow/tools/docs:generate_lib", "//tensorflow/tools/docs:parser", "//tensorflow/tools/docs:py_guide_parser", -] + if_xla_available([ - "//tensorflow/compiler/tf2xla:xla_compiled_cpu_function", -]) +] # On Windows, python binary is a zip file of runfiles tree. # Add everything to its data dependency for generating a runfiles tree From 6d445432639dc88fa3cd7172a52d16b57f9b6dd3 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Wed, 19 Feb 2020 00:55:21 -0800 Subject: [PATCH 220/442] Run mlir_gpu tests from files. This makes it easier to iterate over tests, because the test doesn't have to be recompiled all the time. 
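A rough sketch of the file-driven pattern this change adopts (a simplified stand-in for FileCheck that ignores its regex and `[[...]]` capture syntax; the helper name and file layout are illustrative only, not the XLA implementation):

```python
# Hedged sketch: the HLO text and its expected "// CHECK:" lines live in one
# data file, so test cases can be edited without recompiling the test binary.
import re


def run_checks_from_file(compiler_output, hlo_test_file):
  """Asserts that the "// CHECK:" lines in hlo_test_file appear, in order."""
  with open(hlo_test_file) as f:
    checks = [line.split("// CHECK:", 1)[1].strip()
              for line in f if "// CHECK:" in line]
  pos = 0
  for expected in checks:
    # Literal, order-sensitive matching; real FileCheck is far richer.
    match = re.search(re.escape(expected), compiler_output[pos:])
    assert match is not None, "missing expected line: %s" % expected
    pos += match.end()
```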
PiperOrigin-RevId: 295913906 Change-Id: I975a8db086aceb862498f2d63138cb0bf4859c00 --- .../compiler/xla/service/mlir_gpu/BUILD | 2 + .../service/mlir_gpu/mlir_irgen_test_base.cc | 64 ++- .../service/mlir_gpu/mlir_irgen_test_base.h | 32 +- .../compiler/xla/service/mlir_gpu/tests/BUILD | 28 + .../xla/service/mlir_gpu/tests/abs.hlo | 9 + .../xla/service/mlir_gpu/tests/add.hlo | 11 + .../service/mlir_gpu/tests/add_as_kernel.hlo | 62 +++ .../mlir_gpu/tests/add_in_gpu_dialect.hlo | 19 + .../service/mlir_gpu/tests/add_multiply.hlo | 21 + .../mlir_gpu/tests/add_multiply_gpu.hlo | 22 + .../xla/service/mlir_gpu/tests/add_reduce.hlo | 23 + .../xla/service/mlir_gpu/tests/broadcast.hlo | 13 + .../xla/service/mlir_gpu/tests/broken_add.hlo | 9 + .../xla/service/mlir_gpu/tests/ceil.hlo | 9 + .../xla/service/mlir_gpu/tests/compare.hlo | 12 + .../xla/service/mlir_gpu/tests/const.hlo | 11 + .../xla/service/mlir_gpu/tests/copy.hlo | 9 + .../xla/service/mlir_gpu/tests/cos.hlo | 9 + .../xla/service/mlir_gpu/tests/exp.hlo | 11 + .../service/mlir_gpu/tests/fused_reduce.hlo | 34 ++ .../xla/service/mlir_gpu/tests/iota.hlo | 10 + .../mlir_gpu/tests/iota_add_multiply.hlo | 15 + .../xla/service/mlir_gpu/tests/log.hlo | 10 + .../mlir_gpu/tests/mlir_gpu_lhlo_gen_test.cc | 516 +++--------------- .../xla/service/mlir_gpu/tests/neg.hlo | 9 + .../xla/service/mlir_gpu/tests/rem.hlo | 10 + .../xla/service/mlir_gpu/tests/rsqrt.hlo | 10 + .../xla/service/mlir_gpu/tests/select.hlo | 13 + .../xla/service/mlir_gpu/tests/sign.hlo | 9 + .../xla/service/mlir_gpu/tests/tanh.hlo | 9 + tensorflow/compiler/xla/tests/filecheck.cc | 20 +- tensorflow/compiler/xla/tests/filecheck.h | 9 +- 32 files changed, 572 insertions(+), 478 deletions(-) create mode 100644 tensorflow/compiler/xla/service/mlir_gpu/tests/abs.hlo create mode 100644 tensorflow/compiler/xla/service/mlir_gpu/tests/add.hlo create mode 100644 tensorflow/compiler/xla/service/mlir_gpu/tests/add_as_kernel.hlo create mode 100644 tensorflow/compiler/xla/service/mlir_gpu/tests/add_in_gpu_dialect.hlo create mode 100644 tensorflow/compiler/xla/service/mlir_gpu/tests/add_multiply.hlo create mode 100644 tensorflow/compiler/xla/service/mlir_gpu/tests/add_multiply_gpu.hlo create mode 100644 tensorflow/compiler/xla/service/mlir_gpu/tests/add_reduce.hlo create mode 100644 tensorflow/compiler/xla/service/mlir_gpu/tests/broadcast.hlo create mode 100644 tensorflow/compiler/xla/service/mlir_gpu/tests/broken_add.hlo create mode 100644 tensorflow/compiler/xla/service/mlir_gpu/tests/ceil.hlo create mode 100644 tensorflow/compiler/xla/service/mlir_gpu/tests/compare.hlo create mode 100644 tensorflow/compiler/xla/service/mlir_gpu/tests/const.hlo create mode 100644 tensorflow/compiler/xla/service/mlir_gpu/tests/copy.hlo create mode 100644 tensorflow/compiler/xla/service/mlir_gpu/tests/cos.hlo create mode 100644 tensorflow/compiler/xla/service/mlir_gpu/tests/exp.hlo create mode 100644 tensorflow/compiler/xla/service/mlir_gpu/tests/fused_reduce.hlo create mode 100644 tensorflow/compiler/xla/service/mlir_gpu/tests/iota.hlo create mode 100644 tensorflow/compiler/xla/service/mlir_gpu/tests/iota_add_multiply.hlo create mode 100644 tensorflow/compiler/xla/service/mlir_gpu/tests/log.hlo create mode 100644 tensorflow/compiler/xla/service/mlir_gpu/tests/neg.hlo create mode 100644 tensorflow/compiler/xla/service/mlir_gpu/tests/rem.hlo create mode 100644 tensorflow/compiler/xla/service/mlir_gpu/tests/rsqrt.hlo create mode 100644 tensorflow/compiler/xla/service/mlir_gpu/tests/select.hlo create mode 100644 
tensorflow/compiler/xla/service/mlir_gpu/tests/sign.hlo create mode 100644 tensorflow/compiler/xla/service/mlir_gpu/tests/tanh.hlo diff --git a/tensorflow/compiler/xla/service/mlir_gpu/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/BUILD index 51be8d6fdb5..afceefdeae6 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/BUILD +++ b/tensorflow/compiler/xla/service/mlir_gpu/BUILD @@ -193,7 +193,9 @@ cc_library( "//tensorflow/compiler/xla/tests:codegen_test_base", "//tensorflow/compiler/xla/tests:filecheck", "//tensorflow/compiler/xla/tests:verified_hlo_module", + "//tensorflow/core:lib", "//tensorflow/core:test", + "//tensorflow/core/platform:resource_loader", "//tensorflow/core/platform:test", "@com_google_absl//absl/memory", "@llvm-project//llvm:support", diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.cc b/tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.cc index dbc6efe9ec9..fa2167a4bd9 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.cc @@ -32,6 +32,9 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/filecheck.h" #include "tensorflow/compiler/xla/tests/verified_hlo_module.h" #include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/resource_loader.h" #include "tensorflow/core/platform/test.h" namespace xla { @@ -46,8 +49,10 @@ void MlirIrGenTestBase::CompileIr(std::unique_ptr hlo_module, TF_ASSERT_OK(status); } -void MlirIrGenTestBase::PatternMatch(const string& str, const string& pattern) { - StatusOr filecheck_result = RunFileCheck(str, pattern); +void MlirIrGenTestBase::PatternMatch(const std::string& str, + const std::string& pattern_file) { + StatusOr filecheck_result = + RunFileCheckWithPatternFile(str, pattern_file); TF_ASSERT_OK(filecheck_result.status()); EXPECT_TRUE(filecheck_result.ValueOrDie()); } @@ -55,7 +60,7 @@ void MlirIrGenTestBase::PatternMatch(const string& str, const string& pattern) { string MlirIrGenTestBase::CompileIr( std::unique_ptr hlo_module, MlirCompiler::IRHook::LoweringStage printing_stage) { - string ir; + std::string ir; CompileIr(std::move(hlo_module), {[&ir](mlir::ModuleOp module) -> Status { std::string buffer_string; @@ -70,23 +75,21 @@ string MlirIrGenTestBase::CompileIr( } void MlirIrGenTestBase::CompileAndVerifyIr( - std::unique_ptr hlo_module, const string& pattern, + std::unique_ptr hlo_module, const std::string& pattern_file, LoweringStage printing_stage) { - string ir = CompileIr(std::move(hlo_module), printing_stage); - PatternMatch(ir, pattern); + std::string ir = CompileIr(std::move(hlo_module), printing_stage); + PatternMatch(ir, pattern_file); } -void MlirIrGenTestBase::CompileAndVerifyIr(const string& hlo_text, - const string& expected_llvm_ir, +void MlirIrGenTestBase::CompileAndVerifyIr(const std::string& hlo_text_filename, LoweringStage printing_stage) { - HloModuleConfig config; - config.set_debug_options(GetDebugOptionsForTest()); - auto module = absl::make_unique( - "Module", config, /*verifier_layout_sensitive=*/true, - /*allow_mixed_precision_in_hlo_verifier=*/false, - /*shape_size_function=*/ShapeUtil::ByteSizeOfElements); - TF_ASSERT_OK(module->ParseHloStringAndVerifyModule(hlo_text)); - CompileAndVerifyIr(std::move(module), expected_llvm_ir, printing_stage); + std::string hlo_text_absolute_filename = + 
tensorflow::GetDataDependencyFilepath(hlo_text_filename); + TF_ASSERT_OK_AND_ASSIGN(auto module, + GetVerifiedHloModule(hlo_text_absolute_filename)); + CompileAndVerifyIr(std::move(module), + /*pattern_file=*/hlo_text_absolute_filename, + printing_stage); } MlirCompiler::IRHook MlirIrGenTestBase::getIRHookBreakingLoweringStage( @@ -104,7 +107,7 @@ MlirCompiler::IRHook MlirIrGenTestBase::getIRHookBreakingLoweringStage( StatusOr MlirIrGenTestBase::CompileAndInjectErrors( std::unique_ptr hlo_module, LoweringStage breaking_stage) { - string errors; + std::string errors; auto error_handler = [&errors](const EmissionContext::ErrorMap& error_map, HloModule* hlo_module) { errors = "ERRORS FOUND: "; @@ -127,19 +130,32 @@ StatusOr MlirIrGenTestBase::CompileAndInjectErrors( return status; } -void MlirIrGenTestBase::CompileAndVerifyErrors(const string& hlo_text, - const string& expected_errors, - LoweringStage breaking_stage) { +void MlirIrGenTestBase::CompileAndVerifyErrors( + const std::string& hlo_text_filename, LoweringStage breaking_stage) { + std::string test_srcdir = tensorflow::testing::TensorFlowSrcRoot(); + std::string hlo_text_absolute_filename = + tensorflow::GetDataDependencyFilepath(hlo_text_filename); + TF_ASSERT_OK_AND_ASSIGN(auto module, + GetVerifiedHloModule(hlo_text_absolute_filename)); + TF_ASSERT_OK_AND_ASSIGN( + std::string errors, + CompileAndInjectErrors(std::move(module), breaking_stage)); + PatternMatch(errors, /*pattern_file=*/hlo_text_absolute_filename); +} + +StatusOr> +MlirIrGenTestBase::GetVerifiedHloModule(const std::string& hlo_text_filename) { HloModuleConfig config; config.set_debug_options(GetDebugOptionsForTest()); auto module = absl::make_unique( "Module", config, /*verifier_layout_sensitive=*/true, /*allow_mixed_precision_in_hlo_verifier=*/false, /*shape_size_function=*/ShapeUtil::ByteSizeOfElements); - TF_ASSERT_OK(module->ParseHloStringAndVerifyModule(hlo_text)); - TF_ASSERT_OK_AND_ASSIGN( - string errors, CompileAndInjectErrors(std::move(module), breaking_stage)); - PatternMatch(errors, expected_errors); + std::string hlo_text; + TF_RETURN_IF_ERROR(tensorflow::ReadFileToString( + tensorflow::Env::Default(), hlo_text_filename, &hlo_text)); + TF_RETURN_IF_ERROR(module->ParseHloStringAndVerifyModule(hlo_text)); + return std::move(module); } MlirCompiler* MlirIrGenTestBase::GetMLIRCompiler() { diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.h b/tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.h index a46b606d75e..46246c0d4d6 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.h +++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.h @@ -39,38 +39,36 @@ class MlirIrGenTestBase : public CodegenTestBase { // steps to LLVM IR are applied; otherwise, the IR before lowering is // matched. void CompileAndVerifyIr(std::unique_ptr hlo_module, - const string& pattern, LoweringStage printing_stage); + const std::string& pattern_file, + LoweringStage printing_stage); - // A thin wrapper around CompileAndVerifyIr that parses `hlo_text` to create - // an HLO module. - void CompileAndVerifyIr(const string& hlo_text, - const string& expected_llvm_ir, + // A thin wrapper around CompileAndVerifyIr that parses the hlo text in + // `hlo_text_filename` to create an HLO module. + void CompileAndVerifyIr(const std::string& hlo_text_filename, LoweringStage printing_stage = LoweringStage::LHLO); - // Compiles and returns module with optimizations from a given HLO. 
- StatusOr> GetOptimizedModule( - absl::string_view hlo); - // Adds the InjectErrorsForTestingPass to MLIRCompiler on the provided - // lowering stage, compiles the given HLO module, and returns a string + // lowering stage, compiles the given HLO module, and returns a std::string // representation of all the errors occurred during compiling. StatusOr CompileAndInjectErrors(std::unique_ptr hlo_module, LoweringStage breaking_stage); // Adds the InjectErrorsForTestingPass to MLIRCompiler on the provided // lowering stage, parses and compiles `hlo_text`, and verifies that the - // string representation of all the errors occurred during compiling matches - // the given pattern. - void CompileAndVerifyErrors(const string& hlo_text, - const string& expected_errors, + // std::string representation of all the errors occurred during compiling + // matches the given pattern. + void CompileAndVerifyErrors(const std::string& hlo_text_filename, LoweringStage breaking_stage); private: + StatusOr> GetVerifiedHloModule( + const std::string& hlo_text_filename); + void CompileIr(std::unique_ptr hlo_module, const MlirCompiler::IRHook& ir_hook); - void PatternMatch(const string& str, const string& pattern); - string CompileIr(std::unique_ptr hlo_module, - LoweringStage printing_stage); + void PatternMatch(const std::string& str, const std::string& pattern_file); + std::string CompileIr(std::unique_ptr hlo_module, + LoweringStage printing_stage); MlirCompiler::IRHook getIRHookBreakingLoweringStage( LoweringStage breaking_stage); MlirCompiler* GetMLIRCompiler(); diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD index 05429224f6a..aeaaf0b16c4 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD @@ -25,11 +25,39 @@ package_group( tf_cc_test( name = "mlir_gpu_lhlo_gen_test", srcs = if_cuda_is_configured(["mlir_gpu_lhlo_gen_test.cc"]), + data = [ + "abs.hlo", + "add.hlo", + "add_as_kernel.hlo", + "add_in_gpu_dialect.hlo", + "add_multiply.hlo", + "add_multiply_gpu.hlo", + "add_reduce.hlo", + "broadcast.hlo", + "broken_add.hlo", + "ceil.hlo", + "compare.hlo", + "const.hlo", + "copy.hlo", + "cos.hlo", + "exp.hlo", + "fused_reduce.hlo", + "iota.hlo", + "iota_add_multiply.hlo", + "log.hlo", + "neg.hlo", + "rem.hlo", + "rsqrt.hlo", + "select.hlo", + "sign.hlo", + "tanh.hlo", + ], tags = tf_cuda_tests_tags() + ["no_rocm"], deps = [ "//tensorflow/core:test_main", "//tensorflow/core:test", ] + if_cuda_is_configured([ + "//tensorflow/core:lib", "//tensorflow/compiler/xla/service:gpu_plugin_mlir", "//tensorflow/compiler/xla/service/mlir_gpu:mlir_irgen_test_base", "//tensorflow/stream_executor/lib", diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/abs.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/abs.hlo new file mode 100644 index 00000000000..6a4353d8d45 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/abs.hlo @@ -0,0 +1,9 @@ +HloModule Abs +ENTRY %Abs (val: f32[2,2]) -> f32[2,2] { + %val = f32[2,2]{1,0} parameter(0) + ROOT %abs = f32[2,2]{1,0} abs(f32[2,2]{1,0} %val) +} + +// CHECK: func @abs(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) { +// CHECK: "xla_lhlo.abs"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> () +// CHECK: } diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/add.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/add.hlo new file mode 100644 index 00000000000..d48fcf89658 --- /dev/null +++ 
b/tensorflow/compiler/xla/service/mlir_gpu/tests/add.hlo @@ -0,0 +1,11 @@ +HloModule Add + +ENTRY %Add (x: f32[2,2], y: f32[2,2]) -> f32[2,2] { + %x = f32[2,2]{1,0} parameter(0) + %y = f32[2,2]{1,0} parameter(1) + ROOT %add = f32[2,2]{1,0} add(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y) +} + +// CHECK: func @add(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]]) { +// CHECK: "xla_lhlo.add"(%[[ARG0]], %[[ARG1]], %[[ARG2]]) : ([[TYPE]], [[TYPE]], [[TYPE]]) -> () +// CHECK: } diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/add_as_kernel.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/add_as_kernel.hlo new file mode 100644 index 00000000000..c477cc99c39 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/add_as_kernel.hlo @@ -0,0 +1,62 @@ +HloModule Add + +ENTRY %Add (x: f32[2,2], y: f32[2,2]) -> f32[2,2] { + %x = f32[2,2]{1,0} parameter(0) + %y = f32[2,2]{1,0} parameter(1) + ROOT %add = f32[2,2]{1,0} add(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y) +} + +// CHECK: func @add_kernel(%[[ARG0:.*]]: [[TYPE:!llvm<.*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]] + +// +// Check that relevant sizes and strides are emitted. +// +// CHECK: %[[CAST0:.*]] = llvm.bitcast %[[ARG0:.*]] : !llvm<"i8*"> to !llvm<"float*"> +// CHECK: %[[SIZE00:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64 +// CHECK: %[[SIZE01:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64 +// CHECK: %[[STRIDE01:.*]] = llvm.mlir.constant(1 : i64) : !llvm.i64 +// CHECK: %[[STRIDE00:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64 + +// CHECK: %[[CAST1:.*]] = llvm.bitcast %[[ARG1:.*]] : !llvm<"i8*"> to !llvm<"float*"> +// CHECK: %[[SIZE10:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64 +// CHECK: %[[SIZE11:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64 +// CHECK: %[[STRIDE11:.*]] = llvm.mlir.constant(1 : i64) : !llvm.i64 +// CHECK: %[[STRIDE10:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64 + +// CHECK: %[[CAST2:.*]] = llvm.bitcast %[[ARG2:.*]] : !llvm<"i8*"> to !llvm<"float*"> +// CHECK: %[[SIZE20:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64 +// CHECK: %[[SIZE21:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64 +// CHECK: %[[STRIDE21:.*]] = llvm.mlir.constant(1 : i64) : !llvm.i64 +// CHECK: %[[STRIDE20:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64 + +// +// Check that the emitted sizes and strides, as well the pointers to HLO buffers, +// are inserted into the memref descriptors. 
+// +// CHECK: %[[DESC0:.*]] = llvm.mlir.undef : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> +// CHECK: %[[DESC01:.*]] = llvm.insertvalue %[[CAST0]], %[[DESC0]][0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> +// CHECK: %[[DESC02:.*]] = llvm.insertvalue %[[CAST0]], %[[DESC01]][1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> +// CHECK: %[[DESC03:.*]] = llvm.insertvalue %{{.*}}, %[[DESC02]][2] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> +// CHECK: %[[DESC04:.*]] = llvm.insertvalue %[[SIZE00]], %[[DESC03]][3, 0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> +// CHECK: %[[DESC05:.*]] = llvm.insertvalue %[[STRIDE00]], %[[DESC04]][4, 0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> +// CHECK: %[[DESC06:.*]] = llvm.insertvalue %[[SIZE01]], %[[DESC05]][3, 1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> +// CHECK: %{{.*}} = llvm.insertvalue %[[STRIDE01]], %[[DESC06]][4, 1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> + +// CHECK: %[[DESC1:.*]] = llvm.mlir.undef : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> +// CHECK: %[[DESC11:.*]] = llvm.insertvalue %[[CAST1]], %[[DESC1]][0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> +// CHECK: %[[DESC12:.*]] = llvm.insertvalue %[[CAST1]], %[[DESC11]][1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> +// CHECK: %[[DESC13:.*]] = llvm.insertvalue %{{.*}}, %[[DESC12]][2] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> +// CHECK: %[[DESC14:.*]] = llvm.insertvalue %[[SIZE10]], %[[DESC13]][3, 0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> +// CHECK: %[[DESC15:.*]] = llvm.insertvalue %[[STRIDE10]], %[[DESC14]][4, 0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> +// CHECK: %[[DESC16:.*]] = llvm.insertvalue %[[SIZE11]], %[[DESC15]][3, 1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> +// CHECK: %{{.*}} = llvm.insertvalue %[[STRIDE11]], %[[DESC16]][4, 1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> + +// CHECK: %[[DESC2:.*]] = llvm.mlir.undef : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> +// CHECK: %[[DESC21:.*]] = llvm.insertvalue %[[CAST2]], %[[DESC2]][0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> +// CHECK: %[[DESC22:.*]] = llvm.insertvalue %[[CAST2]], %[[DESC21]][1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> +// CHECK: %[[DESC23:.*]] = llvm.insertvalue %{{.*}}, %[[DESC22]][2] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> +// CHECK: %[[DESC24:.*]] = llvm.insertvalue %[[SIZE20]], %[[DESC23]][3, 0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> +// CHECK: %[[DESC25:.*]] = llvm.insertvalue %[[STRIDE20]], %[[DESC24]][4, 0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> +// CHECK: %[[DESC26:.*]] = llvm.insertvalue %[[SIZE21]], %[[DESC25]][3, 1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> +// CHECK: %{{.*}} = llvm.insertvalue %[[STRIDE21]], %[[DESC26]][4, 1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> + diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/add_in_gpu_dialect.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/add_in_gpu_dialect.hlo new file mode 100644 index 00000000000..ec7df87af64 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/add_in_gpu_dialect.hlo @@ -0,0 +1,19 @@ +HloModule Add + +ENTRY %Add (x: f32[2,2], y: f32[2,2]) -> f32[2,2] { + %x = f32[2,2]{1,0} parameter(0) + %y = f32[2,2]{1,0} parameter(1) + ROOT %add = f32[2,2]{1,0} add(f32[2,2]{1,0} %x, 
f32[2,2]{1,0} %y) +} + +// CHECK: func @add(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]]) { +// CHECK: "gpu.launch_func"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[ARG0]], %[[ARG1]], %[[ARG2]] +// CHECK: } +// CHECK: func @add_kernel(%[[ARG0]]: [[TYPE]], %[[ARG1]]: [[TYPE]], %[[ARG2]]: [[TYPE]] +// CHECK-DAG: std.subview %[[ARG0]]{{\[}}[[INDEX:.*]]] +// CHECK-DAG: std.subview %[[ARG1]]{{\[}}[[INDEX]]] +// CHECK-DAG: std.subview %[[ARG2]]{{\[}}[[INDEX]]] +// CHECK: %[[VAL1:.*]] = load %{{.*\[}}[[INDEX:.*]]] +// CHECK: %[[VAL2:.*]] = load %{{.*\[}}[[INDEX]]] +// CHECK: %[[RES:.*]] = addf %[[VAL1]], %[[VAL2]] +// CHECK: store %[[RES]], %{{.*\[}}[[INDEX]]] diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/add_multiply.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/add_multiply.hlo new file mode 100644 index 00000000000..f4f2e4d2c91 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/add_multiply.hlo @@ -0,0 +1,21 @@ +HloModule AddMultiply + +ENTRY %AddMultiply (x: f32[2,2], y: f32[2,2], z: f32[2,2]) -> f32[2,2] { + %x = f32[2,2]{1,0} parameter(0) + %y = f32[2,2]{1,0} parameter(1) + %z = f32[2,2]{1,0} parameter(2) + %add = f32[2,2]{1,0} add(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y) + ROOT %mul = f32[2,2]{1,0} multiply(f32[2,2]{1,0} %add, f32[2,2]{1,0} %z) +} + +// CHECK: func @fusion(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]], %[[RESULT:.*]]: [[TYPE]]) +// CHECK: "xla_lhlo.fusion"() ( { +// CHECK: %[[REF0:.*]] = tensor_load %[[ARG0]] : [[TYPE]] +// CHECK: %[[REF1:.*]] = tensor_load %[[ARG1]] : [[TYPE]] +// CHECK: %[[REF2:.*]] = tensor_load %[[ARG2]] : [[TYPE]] +// CHECK: %[[ADD:.*]] = xla_hlo.add %[[REF1]], %[[REF2]] +// CHECK: %[[MUL:.*]] = xla_hlo.mul %[[ADD]], %[[REF0]] +// CHECK: tensor_store %[[MUL]], %[[RESULT]] +// CHECK: "xla_lhlo.terminator"() +// CHECK-NEXT: } + diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/add_multiply_gpu.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/add_multiply_gpu.hlo new file mode 100644 index 00000000000..e9000956c23 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/add_multiply_gpu.hlo @@ -0,0 +1,22 @@ +HloModule AddMultiply + +ENTRY %AddMultiply (x: f32[2,2], y: f32[2,2], z: f32[2,2]) -> f32[2,2] { + %x = f32[2,2]{1,0} parameter(0) + %y = f32[2,2]{1,0} parameter(1) + %z = f32[2,2]{1,0} parameter(2) + %add = f32[2,2]{1,0} add(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y) + ROOT %mul = f32[2,2]{1,0} multiply(f32[2,2]{1,0} %add, f32[2,2]{1,0} %z) +} + +// CHECK: func @fusion_kernel(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]], %[[RESULT:.*]]: [[TYPE]]) +// CHECK-DAG: std.subview %[[ARG0]]{{\[}}[[INDEX:.*]]] +// CHECK-DAG: std.subview %[[ARG1]]{{\[}}[[INDEX]]] +// CHECK-DAG: std.subview %[[ARG2]]{{\[}}[[INDEX]]] +// CHECK-DAG: std.subview %[[RESULT]]{{\[}}[[INDEX]]] +// CHECK: %[[V0:.*]] = load %{{.*\[}}[[CSTIDX:.*]]] +// CHECK: %[[V1:.*]] = load %{{.*\[}}[[CSTIDX:.*]]] +// CHECK: %[[ADD:.*]] = addf %[[V0]], %[[V1]] +// CHECK: %[[V2:.*]] = load %{{.*\[}}[[CSTIDX:.*]]] +// CHECK: %[[MUL:.*]] = mulf %[[ADD]], %[[V2]] +// CHECK: store %[[MUL]], %{{.*\[}}[[CSTIDX:.*]]] +// CHECK-NEXT: return diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/add_reduce.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/add_reduce.hlo new file mode 100644 index 00000000000..6df8f284b72 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/add_reduce.hlo @@ -0,0 +1,23 @@ +HloModule AddReduce + +%add (x: 
f32[], y: f32[]) -> f32[] { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %add = f32[] add(f32[] %x, f32[] %y) +} + +ENTRY %AddReduce (x: f32[100,10], c: f32[]) -> f32[100] { + %x = f32[100,10]{1,0} parameter(0) + %c = f32[] parameter(1) + ROOT %reduce = f32[100]{0} reduce(f32[100,10]{1,0} %x, f32[] %c), dimensions={1}, to_apply=%add +} + +// CHECK: func @reduce(%[[ARG:.*]]: [[ARGT:.*]], %[[CST:.*]]: memref, %[[RES:.*]]: [[REST:.*]]) { +// CHECK: "xla_lhlo.reduce"(%[[ARG]], %[[CST]], %[[RES]]) ( { +// CHECK: ^bb0(%[[FARG0:.*]]: memref, %[[FARG1:.*]]: memref, %[[FRES:.*]]: memref): +// CHECK: %[[LHS:.*]] = tensor_load %[[FARG0]] : memref +// CHECK: %[[RHS:.*]] = tensor_load %[[FARG1]] : memref +// CHECK: %[[RES:.*]] = xla_hlo.add %[[LHS]], %[[RHS]] : tensor +// CHECK: tensor_store %[[RES]], %[[FRES]] : memref +// CHECK: "xla_lhlo.terminator"() : () -> () +// CHECK-NEXT: }) {dimensions = dense<1> : tensor<1xi64>} : ([[ARGT]], memref, [[REST]]) -> () diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/broadcast.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/broadcast.hlo new file mode 100644 index 00000000000..b0613ac96ac --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/broadcast.hlo @@ -0,0 +1,13 @@ +HloModule Broadcast + +ENTRY %Broadcast (x: f32[10]) -> f32[10, 5] { + %x = f32[10]{0} parameter(0) + ROOT %broadcast = f32[10, 5]{1,0} broadcast(f32[10]{0} %x), dimensions={0} +} + +// CHECK: func @broadcast(%[[IN:.*]]: [[IN_T:.*]], %[[OUT:.*]]: [[OUT_T:.*]]) { +// CHECK: "xla_lhlo.broadcast_in_dim"(%[[IN]], %[[OUT]]) +// CHECK: {broadcast_dimensions = dense<0> : tensor<1xi64>} +// CHECK: : ([[IN_T]], [[OUT_T]]) -> () +// CHECK: } + diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/broken_add.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/broken_add.hlo new file mode 100644 index 00000000000..b4b22f42f29 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/broken_add.hlo @@ -0,0 +1,9 @@ +HloModule Add + +ENTRY %Add (x: f32[2,2,2], y: f32[2,2,2]) -> f32[2,2,2] { + %x = f32[2,2,2]{2,1,0} parameter(0) + %y = f32[2,2,2]{2,1,0} parameter(1) + ROOT %add = f32[2,2,2]{2,1,0} add(f32[2,2,2]{2,1,0} %x, f32[2,2,2]{2,1,0} %y) +} + +// CHECK: ERRORS FOUND: [%add = f32[2,2,2]{2,1,0} add(f32[2,2,2]{2,1,0} %x, f32[2,2,2]{2,1,0} %y): failed for testing: xla_lhlo.add; failed for testing: std.return] diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/ceil.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/ceil.hlo new file mode 100644 index 00000000000..ff4e8191da4 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/ceil.hlo @@ -0,0 +1,9 @@ +HloModule Ceil +ENTRY %Ceil (val: f32[2,2]) -> f32[2,2] { + %val = f32[2,2]{1,0} parameter(0) + ROOT %ceil = f32[2,2]{1,0} ceil(f32[2,2]{1,0} %val) +} + +// CHECK: func @ceil(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) { +// CHECK: "xla_lhlo.ceil"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> () +// CHECK: } diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/compare.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/compare.hlo new file mode 100644 index 00000000000..a0f88efbd2f --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/compare.hlo @@ -0,0 +1,12 @@ +HloModule Compare + +ENTRY %Compare (x: f32[2,2], y: f32[2,2]) -> pred[2,2] { + %x = f32[2,2]{1,0} parameter(0) + %y = f32[2,2]{1,0} parameter(1) + ROOT %compare = pred[2,2]{1,0} compare(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y), direction=EQ +} + +// CHECK: func @compare(%[[ARG0:.*]]: 
[[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]], %[[PRED:.*]]: [[PRED_TYPE:.*]]) { +// CHECK: "xla_lhlo.compare"(%[[ARG0]], %[[ARG1]], %[[PRED]]) +// CHECK: {comparison_direction = "EQ"} : ([[TYPE]], [[TYPE]], [[PRED_TYPE]]) -> () +// CHECK: } diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/const.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/const.hlo new file mode 100644 index 00000000000..9c28b3619ac --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/const.hlo @@ -0,0 +1,11 @@ +HloModule Const + +ENTRY %Const () -> s32[100] { + %const.0 = s32[] constant(10) + ROOT %broadcast.0 = s32[100]{0} broadcast(s32[] %const.0), dimensions={} +} + +// CHECK: func @constant(%[[ARG0:.*]]: memref) +// CHECK: "xla_lhlo.constant"(%[[ARG0]]) {value = dense<10> : tensor} +// CHECK: func @broadcast(%[[ARG1:.*]]: memref, %[[ARG2:.*]]: memref<100xi32>) +// CHECK: "xla_lhlo.broadcast_in_dim"(%[[ARG1]], %[[ARG2]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/copy.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/copy.hlo new file mode 100644 index 00000000000..a729a4375b6 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/copy.hlo @@ -0,0 +1,9 @@ +HloModule Copy + +ENTRY %Copy (x: f32[2,4]) -> f32[2,4] { + %x = f32[2,4] parameter(0) + ROOT %copy = f32[2,4] copy(f32[2,4] %x) +} + +// CHECK: func @copy(%[[OPERAND:.*]]: memref<2x4xf32>, %[[RESULT:.*]]: memref<2x4xf32>) { +// CHECK: "xla_lhlo.copy"(%[[OPERAND]], %[[RESULT]]) : (memref<2x4xf32>, memref<2x4xf32>) -> () diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/cos.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/cos.hlo new file mode 100644 index 00000000000..9abc2dad0aa --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/cos.hlo @@ -0,0 +1,9 @@ +HloModule Cos +ENTRY %Cos (val: f32[2,2]) -> f32[2,2] { + %val = f32[2,2]{1,0} parameter(0) + ROOT %cos = f32[2,2]{1,0} cosine(f32[2,2]{1,0} %val) +} + +// CHECK: func @cosine(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) { +// CHECK: "xla_lhlo.cos"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> () +// CHECK: } diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/exp.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/exp.hlo new file mode 100644 index 00000000000..9af0de99d42 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/exp.hlo @@ -0,0 +1,11 @@ +HloModule Exp + +ENTRY %Exp (x: f32[2,2]) -> f32[2,2] { + %x = f32[2,2]{1,0} parameter(0) + ROOT %exp = f32[2,2]{1,0} exponential(f32[2,2]{1,0} %x) +} + +// CHECK: func @exponential(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) { +// CHECK: "xla_lhlo.exp"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> () +// CHECK: } + diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/fused_reduce.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/fused_reduce.hlo new file mode 100644 index 00000000000..a673469977f --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/fused_reduce.hlo @@ -0,0 +1,34 @@ +HloModule FusedReduce + +%add (x: f32[], y: f32[]) -> f32[] { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %add = f32[] add(f32[] %x, f32[] %y) +} + +%fused_computation (param: f32[100,10]) -> f32[10] { + %param = f32[100,10] parameter(0) + %constant = f32[] constant(0) + ROOT %reduce = f32[10]{0} reduce(f32[100,10]{1,0} %param, f32[] %constant), + dimensions={0}, to_apply=%add +} + +ENTRY %FusedReduce (x: f32[100,10]) -> f32[10] { + %x = f32[100,10] parameter(0) + ROOT %fusion = 
f32[10]{0} fusion(f32[100,10]{1,0} %x), kind=kInput, + calls=%fused_computation +} + +// CHECK: func @fusion(%[[ARG0:.*]]: [[TYPE:.*]], %[[RESULT:.*]]: [[RTYPE:.*]]) +// CHECK: "xla_lhlo.fusion"() ( { +// CHECK: %[[REF0:.*]] = tensor_load %arg0 : [[TYPE]] +// CHECK: %[[CT0:.*]] = xla_hlo.constant dense<0.000000e+00> +// CHECK: %[[RED:.*]] = "xla_hlo.reduce"(%0, %1) ( { +// CHECK: ^bb0(%[[BARG0:.*]]: [[ETYPE:.*]], %[[BARG1:.*]]: [[ETYPE]]) +// CHECK: %[[ADD:.*]] = xla_hlo.add %[[BARG0]], %[[BARG1]] : [[ETYPE]] +// CHECK: "xla_hlo.return"(%[[ADD]]) +// CHECK: }) +// CHECK: tensor_store %[[RED]], %[[RESULT]] : [[RTYPE]] +// CHECK: "xla_lhlo.terminator"() +// CHECK-NEXT: }) + diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/iota.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/iota.hlo new file mode 100644 index 00000000000..d622ed0e528 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/iota.hlo @@ -0,0 +1,10 @@ +HloModule Iota + + ENTRY %Iota() -> s64[10, 5] { + ROOT %iota = s64[10, 5]{1,0} iota(), iota_dimension=0 +} + +// CHECK: func @iota(%[[OUT:.*]]: [[OUT_T:.*]]) { +// CHECK: "xla_lhlo.iota"(%[[OUT]]) +// CHECK: {iota_dimension = 0 : i64} : ([[OUT_T]]) -> () +// CHECK: } diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/iota_add_multiply.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/iota_add_multiply.hlo new file mode 100644 index 00000000000..89b7a43a102 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/iota_add_multiply.hlo @@ -0,0 +1,15 @@ +HloModule AddMultiply + +ENTRY %AddMultiply (x: s32[2,2], y: s32[2,2]) -> s32[2,2] { + %x = s32[2,2]{1,0} parameter(0) + %y = s32[2,2]{1,0} parameter(1) + + %add = s32[2,2]{1,0} add(s32[2,2]{1,0} %x, s32[2,2]{1,0} %y) + %iota = s32[2, 2]{1,0} iota(), iota_dimension=0 + + ROOT %mul = s32[2,2]{1,0} multiply(s32[2,2]{1,0} %add, s32[2,2]{1,0} %iota) +} + +// CHECK-NOT: store +// CHECK: %[[RESULT:.*]] = muli %{{.*}}, %{{.*}} +// CHECK: store %[[RESULT]] diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/log.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/log.hlo new file mode 100644 index 00000000000..c7e2574558a --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/log.hlo @@ -0,0 +1,10 @@ +HloModule Log + +ENTRY %Log (x: f32[2,2]) -> f32[2,2] { + %x = f32[2,2]{1,0} parameter(0) + ROOT %log = f32[2,2]{1,0} log(f32[2,2]{1,0} %x) +} + +// CHECK: func @log(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) { +// CHECK: "xla_lhlo.log"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> () +// CHECK: } diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/mlir_gpu_lhlo_gen_test.cc b/tensorflow/compiler/xla/service/mlir_gpu/tests/mlir_gpu_lhlo_gen_test.cc index 9a23ff8748e..7afb7e9281d 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/mlir_gpu_lhlo_gen_test.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/mlir_gpu_lhlo_gen_test.cc @@ -14,6 +14,7 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/xla/service/mlir_gpu/mlir_irgen_test_base.h" +#include "tensorflow/core/platform/path.h" namespace xla { namespace mlir_gpu { @@ -21,513 +22,174 @@ namespace mlir_gpu { class LhloGenTest : public MlirIrGenTestBase {}; TEST_F(LhloGenTest, Const) { - CompileAndVerifyIr(R"( -HloModule Const - -ENTRY %Const () -> s32[100] { - %const.0 = s32[] constant(10) - ROOT %broadcast.0 = s32[100]{0} broadcast(s32[] %const.0), dimensions={} -})", - R"( -;CHECK: func @constant(%[[ARG0:.*]]: memref) -;CHECK: "xla_lhlo.constant"(%[[ARG0]]) {value = dense<10> : tensor} -;CHECK: func @broadcast(%[[ARG1:.*]]: memref, %[[ARG2:.*]]: memref<100xi32>) -;CHECK: "xla_lhlo.broadcast_in_dim"(%[[ARG1]], %[[ARG2]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} -)", - LoweringStage::LHLO); + CompileAndVerifyIr( + /*hlo_text_filename=*/tensorflow::io::JoinPath( + "tensorflow", "compiler", "xla", "service", "mlir_gpu", "tests", + "const.hlo"), + LoweringStage::LHLO); } TEST_F(LhloGenTest, BrokenAdd) { CompileAndVerifyErrors( - R"( -HloModule Add - -ENTRY %Add (x: f32[2,2,2], y: f32[2,2,2]) -> f32[2,2,2] { - %x = f32[2,2,2]{2,1,0} parameter(0) - %y = f32[2,2,2]{2,1,0} parameter(1) - ROOT %add = f32[2,2,2]{2,1,0} add(f32[2,2,2]{2,1,0} %x, f32[2,2,2]{2,1,0} %y) -})", - R"(CHECK: ERRORS FOUND: [%add = f32[2,2,2]{2,1,0} add(f32[2,2,2]{2,1,0} %x, f32[2,2,2]{2,1,0} %y): failed for testing: xla_lhlo.add; failed for testing: std.return])", + /*hlo_text_filename=*/ + tensorflow::io::JoinPath("tensorflow", "compiler", "xla", "service", + "mlir_gpu", "tests", "broken_add.hlo"), LoweringStage::LHLO); } TEST_F(LhloGenTest, Add) { - CompileAndVerifyIr(R"( -HloModule Add - -ENTRY %Add (x: f32[2,2], y: f32[2,2]) -> f32[2,2] { - %x = f32[2,2]{1,0} parameter(0) - %y = f32[2,2]{1,0} parameter(1) - ROOT %add = f32[2,2]{1,0} add(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y) -})", - R"( -;CHECK: func @add(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]]) { -;CHECK: "xla_lhlo.add"(%[[ARG0]], %[[ARG1]], %[[ARG2]]) : ([[TYPE]], [[TYPE]], [[TYPE]]) -> () -;CHECK: } - )"); + CompileAndVerifyIr( + /*hlo_text_filename=*/tensorflow::io::JoinPath( + "tensorflow", "compiler", "xla", "service", "mlir_gpu", "tests", + "add.hlo")); } TEST_F(LhloGenTest, Compare) { - CompileAndVerifyIr(R"( -HloModule Compare - -ENTRY %Compare (x: f32[2,2], y: f32[2,2]) -> pred[2,2] { - %x = f32[2,2]{1,0} parameter(0) - %y = f32[2,2]{1,0} parameter(1) - ROOT %compare = pred[2,2]{1,0} compare(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y), direction=EQ -})", - R"( -;CHECK: func @compare(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]], %[[PRED:.*]]: [[PRED_TYPE:.*]]) { -;CHECK: "xla_lhlo.compare"(%[[ARG0]], %[[ARG1]], %[[PRED]]) -;CHECK: {comparison_direction = "EQ"} : ([[TYPE]], [[TYPE]], [[PRED_TYPE]]) -> () -;CHECK: } -)"); + CompileAndVerifyIr( + /*hlo_text_filename=*/tensorflow::io::JoinPath( + "tensorflow", "compiler", "xla", "service", "mlir_gpu", "tests", + "compare.hlo")); } TEST_F(LhloGenTest, Copy) { - CompileAndVerifyIr(R"( -HloModule Copy - -ENTRY %Copy (x: f32[2,4]) -> f32[2,4] { - %x = f32[2,4] parameter(0) - ROOT %copy = f32[2,4] copy(f32[2,4] %x) -})", - R"( -;CHECK: func @copy(%[[OPERAND:.*]]: memref<2x4xf32>, %[[RESULT:.*]]: memref<2x4xf32>) { -;CHECK: "xla_lhlo.copy"(%[[OPERAND]], %[[RESULT]]) : (memref<2x4xf32>, memref<2x4xf32>) -> () - )"); + CompileAndVerifyIr( + /*hlo_text_filename=*/tensorflow::io::JoinPath( + "tensorflow", "compiler", 
"xla", "service", "mlir_gpu", "tests", + "copy.hlo")); } TEST_F(LhloGenTest, Select) { - CompileAndVerifyIr(R"( -HloModule Select - -ENTRY %Select (p: pred[2,2], x: f32[2,2], y: f32[2,2]) -> f32[2,2] { - %p = pred[2,2]{1,0} parameter(0) - %x = f32[2,2]{1,0} parameter(1) - %y = f32[2,2]{1,0} parameter(2) - ROOT %select = f32[2,2]{1,0} select(pred[2,2]{1,0} %p, f32[2,2]{1,0} %x, f32[2,2]{1,0} %y) -})", - R"( -;CHECK: func @select(%[[PRED:.*]]: [[PRED_TYPE:.*]], %[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]]) { -;CHECK: "xla_lhlo.select"(%[[PRED]], %[[ARG0]], %[[ARG1]], %[[ARG2]]) : ([[PRED_TYPE]], [[TYPE]], [[TYPE]], [[TYPE]]) -> () -;CHECK: } - )"); + CompileAndVerifyIr( + /*hlo_text_filename=*/tensorflow::io::JoinPath( + "tensorflow", "compiler", "xla", "service", "mlir_gpu", "tests", + "select.hlo")); } TEST_F(LhloGenTest, Exp) { - CompileAndVerifyIr(R"( -HloModule Exp - -ENTRY %Exp (x: f32[2,2]) -> f32[2,2] { - %x = f32[2,2]{1,0} parameter(0) - ROOT %exp = f32[2,2]{1,0} exponential(f32[2,2]{1,0} %x) -})", - R"( -;CHECK: func @exponential(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) { -;CHECK: "xla_lhlo.exp"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> () -;CHECK: } - )"); + CompileAndVerifyIr( + /*hlo_text_filename=*/tensorflow::io::JoinPath( + "tensorflow", "compiler", "xla", "service", "mlir_gpu", "tests", + "exp.hlo")); } TEST_F(LhloGenTest, Log) { - CompileAndVerifyIr(R"( -HloModule Log - -ENTRY %Log (x: f32[2,2]) -> f32[2,2] { - %x = f32[2,2]{1,0} parameter(0) - ROOT %log = f32[2,2]{1,0} log(f32[2,2]{1,0} %x) -})", - R"( -;CHECK: func @log(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) { -;CHECK: "xla_lhlo.log"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> () -;CHECK: } - )"); + CompileAndVerifyIr( + /*hlo_text_filename=*/tensorflow::io::JoinPath( + "tensorflow", "compiler", "xla", "service", "mlir_gpu", "tests", + "log.hlo")); } TEST_F(LhloGenTest, AddInGPUDialect) { - CompileAndVerifyIr(R"( -HloModule Add - -ENTRY %Add (x: f32[2,2], y: f32[2,2]) -> f32[2,2] { - %x = f32[2,2]{1,0} parameter(0) - %y = f32[2,2]{1,0} parameter(1) - ROOT %add = f32[2,2]{1,0} add(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y) -})", - R"( -;CHECK: func @add(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]]) { -;CHECK: "gpu.launch_func"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[ARG0]], %[[ARG1]], %[[ARG2]] -;CHECK: } -;CHECK: func @add_kernel(%[[ARG0]]: [[TYPE]], %[[ARG1]]: [[TYPE]], %[[ARG2]]: [[TYPE]] -;CHECK-DAG: std.subview %[[ARG0]]{{\[}}[[INDEX:.*]]] -;CHECK-DAG: std.subview %[[ARG1]]{{\[}}[[INDEX]]] -;CHECK-DAG: std.subview %[[ARG2]]{{\[}}[[INDEX]]] -;CHECK: %[[VAL1:.*]] = load %{{.*\[}}[[INDEX:.*]]] -;CHECK: %[[VAL2:.*]] = load %{{.*\[}}[[INDEX]]] -;CHECK: %[[RES:.*]] = addf %[[VAL1]], %[[VAL2]] -;CHECK: store %[[RES]], %{{.*\[}}[[INDEX]]] - )", - LoweringStage::GPU); + CompileAndVerifyIr( + /*hlo_text_filename=*/ + tensorflow::io::JoinPath("tensorflow", "compiler", "xla", "service", + "mlir_gpu", "tests", "add_in_gpu_dialect.hlo"), + LoweringStage::GPU); } // This test verifies that the kernel signature is amended correctly. The actual // body of the generated function does not matter, it is already checked at the // GPU level above. 
TEST_F(LhloGenTest, AddAsKernel) { - CompileAndVerifyIr(R"( -HloModule Add - -ENTRY %Add (x: f32[2,2], y: f32[2,2]) -> f32[2,2] { - %x = f32[2,2]{1,0} parameter(0) - %y = f32[2,2]{1,0} parameter(1) - ROOT %add = f32[2,2]{1,0} add(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y) -})", - R"( -;CHECK: func @add_kernel(%[[ARG0:.*]]: [[TYPE:!llvm<.*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]] - -; -; Check that relevant sizes and strides are emitted. -; -;CHECK: %[[CAST0:.*]] = llvm.bitcast %[[ARG0:.*]] : !llvm<"i8*"> to !llvm<"float*"> -;CHECK: %[[SIZE00:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64 -;CHECK: %[[SIZE01:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64 -;CHECK: %[[STRIDE01:.*]] = llvm.mlir.constant(1 : i64) : !llvm.i64 -;CHECK: %[[STRIDE00:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64 - -;CHECK: %[[CAST1:.*]] = llvm.bitcast %[[ARG1:.*]] : !llvm<"i8*"> to !llvm<"float*"> -;CHECK: %[[SIZE10:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64 -;CHECK: %[[SIZE11:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64 -;CHECK: %[[STRIDE11:.*]] = llvm.mlir.constant(1 : i64) : !llvm.i64 -;CHECK: %[[STRIDE10:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64 - -;CHECK: %[[CAST2:.*]] = llvm.bitcast %[[ARG2:.*]] : !llvm<"i8*"> to !llvm<"float*"> -;CHECK: %[[SIZE20:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64 -;CHECK: %[[SIZE21:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64 -;CHECK: %[[STRIDE21:.*]] = llvm.mlir.constant(1 : i64) : !llvm.i64 -;CHECK: %[[STRIDE20:.*]] = llvm.mlir.constant(2 : i64) : !llvm.i64 - -; -; Check that the emitted sizes and strides, as well the pointers to HLO buffers, -; are inserted into the memref descriptors. -; -;CHECK: %[[DESC0:.*]] = llvm.mlir.undef : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -;CHECK: %[[DESC01:.*]] = llvm.insertvalue %[[CAST0]], %[[DESC0]][0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -;CHECK: %[[DESC02:.*]] = llvm.insertvalue %[[CAST0]], %[[DESC01]][1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -;CHECK: %[[DESC03:.*]] = llvm.insertvalue %{{.*}}, %[[DESC02]][2] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -;CHECK: %[[DESC04:.*]] = llvm.insertvalue %[[SIZE00]], %[[DESC03]][3, 0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -;CHECK: %[[DESC05:.*]] = llvm.insertvalue %[[STRIDE00]], %[[DESC04]][4, 0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -;CHECK: %[[DESC06:.*]] = llvm.insertvalue %[[SIZE01]], %[[DESC05]][3, 1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -;CHECK: %{{.*}} = llvm.insertvalue %[[STRIDE01]], %[[DESC06]][4, 1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> - -;CHECK: %[[DESC1:.*]] = llvm.mlir.undef : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -;CHECK: %[[DESC11:.*]] = llvm.insertvalue %[[CAST1]], %[[DESC1]][0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -;CHECK: %[[DESC12:.*]] = llvm.insertvalue %[[CAST1]], %[[DESC11]][1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -;CHECK: %[[DESC13:.*]] = llvm.insertvalue %{{.*}}, %[[DESC12]][2] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -;CHECK: %[[DESC14:.*]] = llvm.insertvalue %[[SIZE10]], %[[DESC13]][3, 0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -;CHECK: %[[DESC15:.*]] = llvm.insertvalue %[[STRIDE10]], %[[DESC14]][4, 0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -;CHECK: %[[DESC16:.*]] = llvm.insertvalue %[[SIZE11]], %[[DESC15]][3, 1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -;CHECK: %{{.*}} = llvm.insertvalue 
%[[STRIDE11]], %[[DESC16]][4, 1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> - -;CHECK: %[[DESC2:.*]] = llvm.mlir.undef : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -;CHECK: %[[DESC21:.*]] = llvm.insertvalue %[[CAST2]], %[[DESC2]][0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -;CHECK: %[[DESC22:.*]] = llvm.insertvalue %[[CAST2]], %[[DESC21]][1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -;CHECK: %[[DESC23:.*]] = llvm.insertvalue %{{.*}}, %[[DESC22]][2] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -;CHECK: %[[DESC24:.*]] = llvm.insertvalue %[[SIZE20]], %[[DESC23]][3, 0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -;CHECK: %[[DESC25:.*]] = llvm.insertvalue %[[STRIDE20]], %[[DESC24]][4, 0] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -;CHECK: %[[DESC26:.*]] = llvm.insertvalue %[[SIZE21]], %[[DESC25]][3, 1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> -;CHECK: %{{.*}} = llvm.insertvalue %[[STRIDE21]], %[[DESC26]][4, 1] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }"> - )", - LoweringStage::KERNEL); + CompileAndVerifyIr( + tensorflow::io::JoinPath("tensorflow", "compiler", "xla", "service", + "mlir_gpu", "tests", "add_as_kernel.hlo"), + LoweringStage::KERNEL); } // TODO(b/149302060) Reenable once fusion is fixed. TEST_F(LhloGenTest, DISABLED_AddMultiply) { - CompileAndVerifyIr(R"( -HloModule AddMultiply - -ENTRY %AddMultiply (x: f32[2,2], y: f32[2,2], z: f32[2,2]) -> f32[2,2] { - %x = f32[2,2]{1,0} parameter(0) - %y = f32[2,2]{1,0} parameter(1) - %z = f32[2,2]{1,0} parameter(2) - %add = f32[2,2]{1,0} add(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y) - ROOT %mul = f32[2,2]{1,0} multiply(f32[2,2]{1,0} %add, f32[2,2]{1,0} %z) -})", - R"( -;CHECK: func @fusion(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]], %[[RESULT:.*]]: [[TYPE]]) -;CHECK: "xla_lhlo.fusion"() ( { -;CHECK: %[[REF0:.*]] = tensor_load %[[ARG0]] : [[TYPE]] -;CHECK: %[[REF1:.*]] = tensor_load %[[ARG1]] : [[TYPE]] -;CHECK: %[[REF2:.*]] = tensor_load %[[ARG2]] : [[TYPE]] -;CHECK: %[[ADD:.*]] = xla_hlo.add %[[REF1]], %[[REF2]] -;CHECK: %[[MUL:.*]] = xla_hlo.mul %[[ADD]], %[[REF0]] -;CHECK: tensor_store %[[MUL]], %[[RESULT]] -;CHECK: "xla_lhlo.terminator"() -;CHECK-NEXT: } - )"); + CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", + "service", "mlir_gpu", "tests", + "add_multiply.hlo")); } // TODO(b/149302060) Reenable once fusion is fixed. 
TEST_F(LhloGenTest, DISABLED_IotaAddMultiply) { - CompileAndVerifyIr(R"( -HloModule AddMultiply - -ENTRY %AddMultiply (x: s32[2,2], y: s32[2,2]) -> s32[2,2] { - %x = s32[2,2]{1,0} parameter(0) - %y = s32[2,2]{1,0} parameter(1) - - %add = s32[2,2]{1,0} add(s32[2,2]{1,0} %x, s32[2,2]{1,0} %y) - %iota = s32[2, 2]{1,0} iota(), iota_dimension=0 - - ROOT %mul = s32[2,2]{1,0} multiply(s32[2,2]{1,0} %add, s32[2,2]{1,0} %iota) -})", - R"( -;CHECK-NOT: store -;CHECK: %[[RESULT:.*]] = muli %{{.*}}, %{{.*}} -;CHECK: store %[[RESULT]] -)", - LoweringStage::GPU); + CompileAndVerifyIr( + tensorflow::io::JoinPath("tensorflow", "compiler", "xla", "service", + "mlir_gpu", "tests", "iota_add_multiply.hlo"), + LoweringStage::GPU); } TEST_F(LhloGenTest, AddMultiplyGPU) { - CompileAndVerifyIr(R"( -HloModule AddMultiply - -ENTRY %AddMultiply (x: f32[2,2], y: f32[2,2], z: f32[2,2]) -> f32[2,2] { - %x = f32[2,2]{1,0} parameter(0) - %y = f32[2,2]{1,0} parameter(1) - %z = f32[2,2]{1,0} parameter(2) - %add = f32[2,2]{1,0} add(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y) - ROOT %mul = f32[2,2]{1,0} multiply(f32[2,2]{1,0} %add, f32[2,2]{1,0} %z) -})", - R"( -;CHECK: func @fusion_kernel(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]], %[[RESULT:.*]]: [[TYPE]]) -;CHECK-DAG: std.subview %[[ARG0]]{{\[}}[[INDEX:.*]]] -;CHECK-DAG: std.subview %[[ARG1]]{{\[}}[[INDEX]]] -;CHECK-DAG: std.subview %[[ARG2]]{{\[}}[[INDEX]]] -;CHECK-DAG: std.subview %[[RESULT]]{{\[}}[[INDEX]]] -;CHECK: %[[V0:.*]] = load %{{.*\[}}[[CSTIDX:.*]]] -;CHECK: %[[V1:.*]] = load %{{.*\[}}[[CSTIDX:.*]]] -;CHECK: %[[ADD:.*]] = addf %[[V0]], %[[V1]] -;CHECK: %[[V2:.*]] = load %{{.*\[}}[[CSTIDX:.*]]] -;CHECK: %[[MUL:.*]] = mulf %[[ADD]], %[[V2]] -;CHECK: store %[[MUL]], %{{.*\[}}[[CSTIDX:.*]]] -;CHECK-NEXT: return - )", - LoweringStage::GPU); + CompileAndVerifyIr( + tensorflow::io::JoinPath("tensorflow", "compiler", "xla", "service", + "mlir_gpu", "tests", "add_multiply_gpu.hlo"), + LoweringStage::GPU); } // TODO(b/137624192): Reenable once we can fuse reductions. 
TEST_F(LhloGenTest, DISABLED_FusedReduce) { - CompileAndVerifyIr(R"( -HloModule FusedReduce - -%add (x: f32[], y: f32[]) -> f32[] { - %x = f32[] parameter(0) - %y = f32[] parameter(1) - ROOT %add = f32[] add(f32[] %x, f32[] %y) -} - -%fused_computation (param: f32[100,10]) -> f32[10] { - %param = f32[100,10] parameter(0) - %constant = f32[] constant(0) - ROOT %reduce = f32[10]{0} reduce(f32[100,10]{1,0} %param, f32[] %constant), - dimensions={0}, to_apply=%add -} - -ENTRY %FusedReduce (x: f32[100,10]) -> f32[10] { - %x = f32[100,10] parameter(0) - ROOT %fusion = f32[10]{0} fusion(f32[100,10]{1,0} %x), kind=kInput, - calls=%fused_computation -} -)", - R"( -;CHECK: func @fusion(%[[ARG0:.*]]: [[TYPE:.*]], %[[RESULT:.*]]: [[RTYPE:.*]]) -;CHECK: "xla_lhlo.fusion"() ( { -;CHECK: %[[REF0:.*]] = tensor_load %arg0 : [[TYPE]] -;CHECK: %[[CT0:.*]] = xla_hlo.constant dense<0.000000e+00> -;CHECK: %[[RED:.*]] = "xla_hlo.reduce"(%0, %1) ( { -;CHECK: ^bb0(%[[BARG0:.*]]: [[ETYPE:.*]], %[[BARG1:.*]]: [[ETYPE]]) -;CHECK: %[[ADD:.*]] = xla_hlo.add %[[BARG0]], %[[BARG1]] : [[ETYPE]] -;CHECK: "xla_hlo.return"(%[[ADD]]) -;CHECK: }) -;CHECK: tensor_store %[[RED]], %[[RESULT]] : [[RTYPE]] -;CHECK: "xla_lhlo.terminator"() -;CHECK-NEXT: }) - )"); + CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", + "service", "mlir_gpu", "tests", + "fused_reduce.hlo")); } TEST_F(LhloGenTest, Broadcast) { - CompileAndVerifyIr(R"( -HloModule Broadcast - -ENTRY %Broadcast (x: f32[10]) -> f32[10, 5] { - %x = f32[10]{0} parameter(0) - ROOT %broadcast = f32[10, 5]{1,0} broadcast(f32[10]{0} %x), dimensions={0} -})", - R"( -;CHECK: func @broadcast(%[[IN:.*]]: [[IN_T:.*]], %[[OUT:.*]]: [[OUT_T:.*]]) { -;CHECK: "xla_lhlo.broadcast_in_dim"(%[[IN]], %[[OUT]]) -;CHECK: {broadcast_dimensions = dense<0> : tensor<1xi64>} -;CHECK: : ([[IN_T]], [[OUT_T]]) -> () -;CHECK: } -)"); + CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", + "service", "mlir_gpu", "tests", + "broadcast.hlo")); } TEST_F(LhloGenTest, Iota) { - CompileAndVerifyIr(R"( - HloModule Iota - - ENTRY %Iota() -> s64[10, 5] { - ROOT %iota = s64[10, 5]{1,0} iota(), iota_dimension=0 -})", - R"( -;CHECK: func @iota(%[[OUT:.*]]: [[OUT_T:.*]]) { -;CHECK: "xla_lhlo.iota"(%[[OUT]]) -;CHECK: {iota_dimension = 0 : i64} : ([[OUT_T]]) -> () -;CHECK: } -)"); + CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", + "service", "mlir_gpu", "tests", + "iota.hlo")); } TEST_F(LhloGenTest, AddReduce) { - CompileAndVerifyIr(R"( -HloModule AddReduce - -%add (x: f32[], y: f32[]) -> f32[] { - %x = f32[] parameter(0) - %y = f32[] parameter(1) - ROOT %add = f32[] add(f32[] %x, f32[] %y) -} - -ENTRY %AddReduce (x: f32[100,10], c: f32[]) -> f32[100] { - %x = f32[100,10]{1,0} parameter(0) - %c = f32[] parameter(1) - ROOT %reduce = f32[100]{0} reduce(f32[100,10]{1,0} %x, f32[] %c), dimensions={1}, to_apply=%add -})", - R"( -;CHECK: func @reduce(%[[ARG:.*]]: [[ARGT:.*]], %[[CST:.*]]: memref, %[[RES:.*]]: [[REST:.*]]) { -;CHECK: "xla_lhlo.reduce"(%[[ARG]], %[[CST]], %[[RES]]) ( { -;CHECK: ^bb0(%[[FARG0:.*]]: memref, %[[FARG1:.*]]: memref, %[[FRES:.*]]: memref): -;CHECK: %[[LHS:.*]] = tensor_load %[[FARG0]] : memref -;CHECK: %[[RHS:.*]] = tensor_load %[[FARG1]] : memref -;CHECK: %[[RES:.*]] = xla_hlo.add %[[LHS]], %[[RHS]] : tensor -;CHECK: tensor_store %[[RES]], %[[FRES]] : memref -;CHECK: "xla_lhlo.terminator"() : () -> () -;CHECK-NEXT: }) {dimensions = dense<1> : tensor<1xi64>} : ([[ARGT]], memref, [[REST]]) -> () - )"); + 
CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", + "service", "mlir_gpu", "tests", + "add_reduce.hlo")); } TEST_F(LhloGenTest, Abs) { - CompileAndVerifyIr(R"( -HloModule Abs -ENTRY %Abs (val: f32[2,2]) -> f32[2,2] { - %val = f32[2,2]{1,0} parameter(0) - ROOT %abs = f32[2,2]{1,0} abs(f32[2,2]{1,0} %val) -})", - R"( -;CHECK: func @abs(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) { -;CHECK: "xla_lhlo.abs"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> () -;CHECK: } - )"); + CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", + "service", "mlir_gpu", "tests", + "abs.hlo")); } TEST_F(LhloGenTest, Ceil) { - CompileAndVerifyIr(R"( -HloModule Ceil -ENTRY %Ceil (val: f32[2,2]) -> f32[2,2] { - %val = f32[2,2]{1,0} parameter(0) - ROOT %ceil = f32[2,2]{1,0} ceil(f32[2,2]{1,0} %val) -})", - R"( -;CHECK: func @ceil(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) { -;CHECK: "xla_lhlo.ceil"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> () -;CHECK: } - )"); + CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", + "service", "mlir_gpu", "tests", + "ceil.hlo")); } TEST_F(LhloGenTest, Cos) { - CompileAndVerifyIr(R"( -HloModule Cos -ENTRY %Cos (val: f32[2,2]) -> f32[2,2] { - %val = f32[2,2]{1,0} parameter(0) - ROOT %cos = f32[2,2]{1,0} cosine(f32[2,2]{1,0} %val) -})", - R"( -;CHECK: func @cosine(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) { -;CHECK: "xla_lhlo.cos"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> () -;CHECK: } - )"); + CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", + "service", "mlir_gpu", "tests", + "cos.hlo")); } TEST_F(LhloGenTest, Neg) { - CompileAndVerifyIr(R"( -HloModule Neg -ENTRY %Neg (val: f32[2,2]) -> f32[2,2] { - %val = f32[2,2]{1,0} parameter(0) - ROOT %neg = f32[2,2]{1,0} negate(f32[2,2]{1,0} %val) -})", - R"( -;CHECK: func @negate(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) { -;CHECK: "xla_lhlo.neg"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> () -;CHECK: } - )"); + CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", + "service", "mlir_gpu", "tests", + "neg.hlo")); } TEST_F(LhloGenTest, Rem) { - CompileAndVerifyIr(R"( -HloModule Rem -ENTRY %Rem(x: f32[2,2], y: f32[2,2]) -> f32[2,2] { - %x = f32[2,2]{1,0} parameter(0) - %y = f32[2,2]{1,0} parameter(1) - ROOT %rem = f32[2,2]{1,0} remainder(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y) -})", - R"( -;CHECK: func @remainder(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]]) { -;CHECK: "xla_lhlo.remainder"(%[[ARG0]], %[[ARG1]], %[[ARG2]]) : ([[TYPE]], [[TYPE]], [[TYPE]]) -> () -;CHECK: } - )"); + CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", + "service", "mlir_gpu", "tests", + "rem.hlo")); } TEST_F(LhloGenTest, Rsqrt) { - CompileAndVerifyIr(R"( -HloModule Rsqrt - -ENTRY %Rsqrt (x: f32[2,2]) -> f32[2,2] { - %x = f32[2,2]{1,0} parameter(0) - ROOT %rsqrt = f32[2,2]{1,0} rsqrt(f32[2,2]{1,0} %x) -})", - R"( -;CHECK: func @rsqrt(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) { -;CHECK: "xla_lhlo.rsqrt"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> () -;CHECK: } - )"); + CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", + "service", "mlir_gpu", "tests", + "rsqrt.hlo")); } TEST_F(LhloGenTest, Sign) { - CompileAndVerifyIr(R"( -HloModule Sign -ENTRY %Sign (val: f32[2,2]) -> f32[2,2] { - %val = f32[2,2]{1,0} parameter(0) - ROOT %sign = f32[2,2]{1,0} sign(f32[2,2]{1,0} %val) -})", - R"( -;CHECK: func 
@sign(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) { -;CHECK: "xla_lhlo.sign"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> () -;CHECK: } - )"); + CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", + "service", "mlir_gpu", "tests", + "sign.hlo")); } TEST_F(LhloGenTest, Tanh) { - CompileAndVerifyIr(R"( -HloModule Tanh -ENTRY %Tanh (val: f32[2,2]) -> f32[2,2] { - %val = f32[2,2]{1,0} parameter(0) - ROOT %tanh = f32[2,2]{1,0} tanh(f32[2,2]{1,0} %val) -})", - R"( -;CHECK: func @tanh(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) { -;CHECK: "xla_lhlo.tanh"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> () -;CHECK: } - )"); + CompileAndVerifyIr(tensorflow::io::JoinPath("tensorflow", "compiler", "xla", + "service", "mlir_gpu", "tests", + "tanh.hlo")); } } // namespace mlir_gpu diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/neg.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/neg.hlo new file mode 100644 index 00000000000..caead37c995 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/neg.hlo @@ -0,0 +1,9 @@ +HloModule Neg +ENTRY %Neg (val: f32[2,2]) -> f32[2,2] { + %val = f32[2,2]{1,0} parameter(0) + ROOT %neg = f32[2,2]{1,0} negate(f32[2,2]{1,0} %val) +} + +// CHECK: func @negate(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) { +// CHECK: "xla_lhlo.neg"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> () +// CHECK: } diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/rem.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/rem.hlo new file mode 100644 index 00000000000..441ace6ef94 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/rem.hlo @@ -0,0 +1,10 @@ +HloModule Rem +ENTRY %Rem(x: f32[2,2], y: f32[2,2]) -> f32[2,2] { + %x = f32[2,2]{1,0} parameter(0) + %y = f32[2,2]{1,0} parameter(1) + ROOT %rem = f32[2,2]{1,0} remainder(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y) +} + +// CHECK: func @remainder(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]]) { +// CHECK: "xla_lhlo.remainder"(%[[ARG0]], %[[ARG1]], %[[ARG2]]) : ([[TYPE]], [[TYPE]], [[TYPE]]) -> () +// CHECK: } diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/rsqrt.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/rsqrt.hlo new file mode 100644 index 00000000000..a10f9ada92b --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/rsqrt.hlo @@ -0,0 +1,10 @@ +HloModule Rsqrt + +ENTRY %Rsqrt (x: f32[2,2]) -> f32[2,2] { + %x = f32[2,2]{1,0} parameter(0) + ROOT %rsqrt = f32[2,2]{1,0} rsqrt(f32[2,2]{1,0} %x) +} + +// CHECK: func @rsqrt(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) { +// CHECK: "xla_lhlo.rsqrt"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> () +// CHECK: } diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/select.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/select.hlo new file mode 100644 index 00000000000..0cbe8c73700 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/select.hlo @@ -0,0 +1,13 @@ +HloModule Select + +ENTRY %Select (p: pred[2,2], x: f32[2,2], y: f32[2,2]) -> f32[2,2] { + %p = pred[2,2]{1,0} parameter(0) + %x = f32[2,2]{1,0} parameter(1) + %y = f32[2,2]{1,0} parameter(2) + ROOT %select = f32[2,2]{1,0} select(pred[2,2]{1,0} %p, f32[2,2]{1,0} %x, f32[2,2]{1,0} %y) +} + +// CHECK: func @select(%[[PRED:.*]]: [[PRED_TYPE:.*]], %[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]]) { +// CHECK: "xla_lhlo.select"(%[[PRED]], %[[ARG0]], %[[ARG1]], %[[ARG2]]) : ([[PRED_TYPE]], [[TYPE]], [[TYPE]], [[TYPE]]) -> () +// CHECK: } + 
diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/sign.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/sign.hlo new file mode 100644 index 00000000000..a0ff329938b --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/sign.hlo @@ -0,0 +1,9 @@ +HloModule Sign +ENTRY %Sign (val: f32[2,2]) -> f32[2,2] { + %val = f32[2,2]{1,0} parameter(0) + ROOT %sign = f32[2,2]{1,0} sign(f32[2,2]{1,0} %val) +} + +// CHECK: func @sign(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) { +// CHECK: "xla_lhlo.sign"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> () +// CHECK: } diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/tanh.hlo b/tensorflow/compiler/xla/service/mlir_gpu/tests/tanh.hlo new file mode 100644 index 00000000000..d539b3002dc --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/tanh.hlo @@ -0,0 +1,9 @@ +HloModule Tanh +ENTRY %Tanh (val: f32[2,2]) -> f32[2,2] { + %val = f32[2,2]{1,0} parameter(0) + ROOT %tanh = f32[2,2]{1,0} tanh(f32[2,2]{1,0} %val) +} + +// CHECK: func @tanh(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]]) { +// CHECK: "xla_lhlo.tanh"(%[[ARG0]], %[[ARG1]]) : ([[TYPE]], [[TYPE]]) -> () +// CHECK: } diff --git a/tensorflow/compiler/xla/tests/filecheck.cc b/tensorflow/compiler/xla/tests/filecheck.cc index 5926ebece39..068d6dc8fca 100644 --- a/tensorflow/compiler/xla/tests/filecheck.cc +++ b/tensorflow/compiler/xla/tests/filecheck.cc @@ -30,24 +30,27 @@ namespace xla { StatusOr RunFileCheck(const std::string& input, absl::string_view pattern) { - using tensorflow::io::JoinPath; - // Generate an input file for the FileCheck pattern. - string pattern_path; + std::string pattern_path; auto env = tensorflow::Env::Default(); if (!env->LocalTempFilename(&pattern_path)) { return tensorflow::errors::Internal("couldn't get a pattern file name"); } TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(env, pattern_path, pattern)); + return RunFileCheckWithPatternFile(input, pattern_path); +} + +StatusOr RunFileCheckWithPatternFile(const std::string& input, + const std::string& pattern_file) { // Invoke FileCheck to check whether input matches `pattern`. - string file_check_path = tensorflow::GetDataDependencyFilepath( - JoinPath("external", "llvm-project", "llvm", "FileCheck")); + std::string file_check_path = tensorflow::GetDataDependencyFilepath( + tensorflow::io::JoinPath("external", "llvm-project", "llvm", "FileCheck")); tensorflow::SubProcess file_check_process; file_check_process.SetProgram( file_check_path, - {file_check_path, "-v", "-dump-input=fail", pattern_path}); + {file_check_path, "-v", "-dump-input=fail", pattern_file}); file_check_process.SetChannelAction(tensorflow::CHAN_STDIN, tensorflow::ACTION_PIPE); file_check_process.SetChannelAction(tensorflow::CHAN_STDERR, @@ -56,7 +59,7 @@ StatusOr RunFileCheck(const std::string& input, return tensorflow::errors::Internal("couldn't start FileCheck"); } - string standard_error; + std::string standard_error; int exit_status = file_check_process.Communicate( /*stdin_input=*/&input, /*stdout_output=*/nullptr, /*stderr_output=*/&standard_error); @@ -64,6 +67,7 @@ StatusOr RunFileCheck(const std::string& input, // FileCheck returns 0 when the inputs match. If matching failed, log // the error message generated by FileCheck and the inputs. 
bool succeeded = (exit_status == 0); + auto env = tensorflow::Env::Default(); if (!succeeded) { LOG(WARNING) << "Tried to execute FileCheck at " << file_check_path; if (!env->FileExists(file_check_path).ok()) { @@ -71,8 +75,6 @@ StatusOr RunFileCheck(const std::string& input, } LOG(WARNING) << "FileCheck error:\n" << standard_error; - LOG(WARNING) << "FileCheck pattern was:"; - XLA_LOG_LINES(tensorflow::WARNING, pattern); } else if (!standard_error.empty()) { LOG(INFO) << "FileCheck stderr:"; XLA_LOG_LINES(tensorflow::INFO, standard_error); diff --git a/tensorflow/compiler/xla/tests/filecheck.h b/tensorflow/compiler/xla/tests/filecheck.h index 23f71c11b78..2723ccc2e9d 100644 --- a/tensorflow/compiler/xla/tests/filecheck.h +++ b/tensorflow/compiler/xla/tests/filecheck.h @@ -26,7 +26,14 @@ namespace xla { // Runs FileCheck with the given pattern over given input string. Provided that // FileCheck can execute, returns true if and only if FileCheck succeeded in // matching the input. -StatusOr RunFileCheck(const string& input, absl::string_view pattern); +StatusOr RunFileCheck(const std::string& input, + absl::string_view pattern); + +// Runs FileCheck with the given pattern file over given input string. Provided +// that FileCheck can execute, returns true if and only if FileCheck succeeded +// in matching the input. +StatusOr RunFileCheckWithPatternFile(const std::string& input, + const std::string& pattern_file); } // namespace xla From 1bd88eb052aa968f643b5ae79f89373a57e59f68 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 01:02:42 -0800 Subject: [PATCH 221/442] compat: Update forward compatibility horizon to 2020-02-19 PiperOrigin-RevId: 295914844 Change-Id: Ib42d1dc7b6700a59ac1a4ad5744daba6929e4c8a --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index e889b989ce0..c6b49129920 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -31,7 +31,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 2, 18) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 2, 19) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From c0a428cf0e90f45afca1cc9f02c9ccaeb15e5976 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Wed, 19 Feb 2020 01:05:43 -0800 Subject: [PATCH 222/442] Add no_pip tags to gather_test. This test was missed when we marked all other tests in this directory with the no_pip tag. 
PiperOrigin-RevId: 295915455 Change-Id: I4c8a34b6ccaa20bdf6804f63f6b4cbb1d466afa7 --- tensorflow/compiler/tests/BUILD | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index cbe92235643..447446f2cdd 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -1414,7 +1414,10 @@ tf_xla_py_test( size = "medium", srcs = ["gather_test.py"], python_version = "PY3", - tags = ["optonly"], + tags = [ + "no_pip", + "optonly", + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", From ad1fd5b040a8da9a0542f09695cc90952dd66c51 Mon Sep 17 00:00:00 2001 From: Officium Date: Wed, 19 Feb 2020 17:28:09 +0800 Subject: [PATCH 223/442] update mathjax for lbeta --- tensorflow/python/ops/special_math_ops.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/ops/special_math_ops.py b/tensorflow/python/ops/special_math_ops.py index 540c101c225..a05a488408d 100644 --- a/tensorflow/python/ops/special_math_ops.py +++ b/tensorflow/python/ops/special_math_ops.py @@ -51,18 +51,24 @@ from tensorflow.python.util.tf_export import tf_export def lbeta(x, name=None): r"""Computes \\(ln(|Beta(x)|)\\), reducing along the last dimension. - Given one-dimensional `z = [z_0,...,z_{K-1}]`, we define + Given one-dimensional $z = [z_1,...,z_K]$, we define - $$Beta(z) = \prod_j Gamma(z_j) / Gamma(\sum_j z_j)$$ + $$Beta(z) = \frac{\prod_j \Gamma(z_j)}{\Gamma(\sum_j z_j)},$$ - And for `n + 1` dimensional `x` with shape `[N1, ..., Nn, K]`, we define - $$lbeta(x)[i1, ..., in] = Log(|Beta(x[i1, ..., in, :])|)$$. + where $\Gamma$ is the gamma function. - In other words, the last dimension is treated as the `z` vector. + And for $n + 1$ dimensional $x$ with shape $[N_1, ..., N_n, K]$, we define - Note that if `z = [u, v]`, then - \\(Beta(z) = int_0^1 t^{u-1} (1 - t)^{v-1} dt\\), which defines the - traditional bivariate beta function. + $$lbeta(x)[i_1, ..., i_n] = \log{|Beta(x[i_1, ..., i_n, :])|}.$$ + + In other words, the last dimension is treated as the $z$ vector. + + Note that if $z = [u, v]$, then + + $$Beta(z) = \frac{\Gamma(u)\Gamma(v)}{\Gamma(u + v)} + = \int_0^1 t^{u-1} (1 - t)^{v-1} \mathrm{d}t,$$ + + which defines the traditional bivariate beta function. If the last dimension is empty, we follow the convention that the sum over the empty set is zero, and the product is one. From 1aab76870995602101bc2b6a8d4f4d63fd37381e Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 19 Feb 2020 01:36:07 -0800 Subject: [PATCH 224/442] Automated rollback of commit e7de2ea3cf237305cb4f38da1c9a371596e2a139 PiperOrigin-RevId: 295919287 Change-Id: I3349e4ccca577f9e766f382814ddd3270354d295 --- .../compiler/jit/mark_for_compilation_pass.cc | 2 - tensorflow/compiler/tests/BUILD | 15 --- .../compiler/tests/searchsorted_op_test.py | 75 ----------- tensorflow/compiler/tf2xla/kernels/BUILD | 2 - .../tf2xla/kernels/lower_upper_bound_ops.cc | 116 ------------------ 5 files changed, 210 deletions(-) delete mode 100644 tensorflow/compiler/tests/searchsorted_op_test.py delete mode 100644 tensorflow/compiler/tf2xla/kernels/lower_upper_bound_ops.cc diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index b36fe6ae5e9..08dc1b13db6 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -1911,7 +1911,6 @@ absl::flat_hash_set GetKnownXLAWhitelistOp() { "LinSpace", "ListDiff", "LogMatrixDeterminant", - "LowerBound", "MatMul", "MatrixBandPart", "MatrixDiag", @@ -2038,7 +2037,6 @@ absl::flat_hash_set GetKnownXLAWhitelistOp() { "TensorScatterUpdate", "TridiagonalSolve", "TruncatedNormal", - "UpperBound", "UnsortedSegmentMax", "UnsortedSegmentMin", "UnsortedSegmentProd", diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 447446f2cdd..e3a62b3fa7b 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -335,21 +335,6 @@ tf_xla_py_test( ], ) -tf_xla_py_test( - name = "searchsorted_op_test", - size = "small", - timeout = "moderate", - srcs = ["searchsorted_op_test.py"], - python_version = "PY3", - tags = [ - "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip - ], - deps = [ - ":xla_test", - "//tensorflow/python:platform_test", - ], -) - tf_xla_py_test( name = "svd_op_test", size = "medium", diff --git a/tensorflow/compiler/tests/searchsorted_op_test.py b/tensorflow/compiler/tests/searchsorted_op_test.py deleted file mode 100644 index d77bd0902d3..00000000000 --- a/tensorflow/compiler/tests/searchsorted_op_test.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Test for XLA implementation of tf.searchsorted.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - -from tensorflow.compiler.tests import xla_test -from tensorflow.python.ops import array_ops -from tensorflow.python.platform import test - - -class SearchSorteddOpTest(xla_test.XLATestCase): - - def test1D(self): - # Test against NumPy implementation (which is 1D only). 
- np.random.seed(1) - for side in ['left', 'right']: - for dtype in [np.float32, np.int32]: - values = np.random.uniform( - low=-1000, high=1000, size=(10,)).astype(dtype) - unsorted = np.random.uniform( - low=-1000, high=1000, size=(20,)).astype(dtype) - - sorted_sequence = np.sort(unsorted) - np_ans = np.searchsorted(sorted_sequence, values, side=side) - - with self.session() as session: - with self.test_scope(): - tf_ans = array_ops.searchsorted(sorted_sequence, values, side=side) - tf_out = session.run(tf_ans) - self.assertAllEqual(np_ans, tf_out) - - def _test2DExample(self, dtype, side, sorted_sequence, values, correct_ans): - - with self.session() as session: - with self.test_scope(): - tf_ans = array_ops.searchsorted(sorted_sequence, values, side=side) - tf_out = session.run(tf_ans) - self.assertAllEqual(correct_ans, tf_out) - - def testLowerBound2DExample(self): - # 2D TensorFlow documentation example. - for dtype in self.float_types | self.int_types: - sorted_sequence = np.array([[0, 3, 9, 9, 10], [1, 2, 3, 4, 5]], dtype) - values = np.array([[2, 4, 9], [0, 2, 6]], dtype) - correct_ans = np.array([[1, 2, 2], [0, 1, 5]], dtype) - self._test2DExample(dtype, 'left', sorted_sequence, values, correct_ans) - - def testUpperBound2DExample(self): - # 2D TensorFlow documentation example. - for dtype in self.float_types | self.int_types: - sorted_sequence = np.array([[0, 3, 9, 9, 10], [1, 2, 3, 4, 5]], dtype) - values = np.array([[2, 4, 9], [0, 2, 6]], dtype) - correct_ans = np.array([[1, 2, 4], [0, 2, 5]], dtype) - self._test2DExample(dtype, 'right', sorted_sequence, values, correct_ans) - - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index 5f1c2f28ba4..8571c503299 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -55,7 +55,6 @@ tf_kernel_library( "index_ops.cc", "l2loss_op.cc", "listdiff_op.cc", - "lower_upper_bound_ops.cc", "lrn_ops.cc", "matmul_op.cc", "matrix_band_part_op.cc", @@ -150,7 +149,6 @@ tf_kernel_library( "//tensorflow/compiler/tf2xla/lib:util", "//tensorflow/compiler/tf2xla/ops:xla_ops", "//tensorflow/compiler/xla:array4d", - "//tensorflow/compiler/xla:comparison_util", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", diff --git a/tensorflow/compiler/tf2xla/kernels/lower_upper_bound_ops.cc b/tensorflow/compiler/tf2xla/kernels/lower_upper_bound_ops.cc deleted file mode 100644 index 0eacf8812f1..00000000000 --- a/tensorflow/compiler/tf2xla/kernels/lower_upper_bound_ops.cc +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/compiler/tf2xla/type_util.h" -#include "tensorflow/compiler/tf2xla/xla_helpers.h" -#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" -#include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/xla_builder.h" -#include "tensorflow/compiler/xla/comparison_util.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" - -namespace tensorflow { -namespace { - -// Builds a LowerBound or UpperBound op, the distinction lying in -// comparison_direction: GT => LowerBoundOp, GE => UpperBoundOp. -// Note that this is an O(MN) algorithm: all entries in each sorted_inputs row -// are considered, and their sorted nature is not fully exploited. -void BuildLowerUpperBoundOp(XlaOpKernelContext* ctx, DataType out_dtype, - xla::ComparisonDirection comparison_direction) { - const TensorShape sorted_inputs_shape = ctx->InputShape("sorted_inputs"); - const TensorShape values_shape = ctx->InputShape("values"); - const xla::XlaOp sorted_inputs = ctx->Input("sorted_inputs"); - const xla::XlaOp values = ctx->Input("values"); - - // We are assuming both inputs are 2D, which they will be given the current - // implementation of tf.searchsorted. - OP_REQUIRES(ctx, sorted_inputs_shape.dims() == 2, - errors::FailedPrecondition("sorted_inputs must be 2D")); - OP_REQUIRES(ctx, values_shape.dims() == 2, - errors::FailedPrecondition("values must be 2D")); - - // Add a new inner dimension to values, to allow broadcasting along the inner - // dimension of sorted_sequence. - auto new_values_shape = values_shape; - new_values_shape.InsertDim(/* d */ 2, /* size */ 1); - auto values_reshaped = xla::Reshape(values, new_values_shape.dim_sizes()); - - // Add a new penultimate dimension to sorted_inputs, to allow broadcasting of - // sorted_sequence entries for each value. - auto new_sorted_inputs_shape = sorted_inputs_shape; - new_sorted_inputs_shape.InsertDim(/* d */ 1, /* size */ 1); - auto sorted_inputs_reshaped = - xla::Reshape(sorted_inputs, new_sorted_inputs_shape.dim_sizes()); - - // We are relying on broadcasting to compare each value against each entry in - // the associated sorted_inputs row. - // The reshapes above leave the tensors with equal rank of 3, so broadcast - // dimensions are not explicitly specified. - auto comparison = xla::Compare(values_reshaped, sorted_inputs_reshaped, {}, - comparison_direction); - - const DataType accumulation_type = XlaHelpers::SumAccumulationType(out_dtype); - - // Convert boolean comparison results to integers so we can sum them. - auto comparison_int = - XlaHelpers::ConvertElementType(comparison, accumulation_type); - - // Sum the comparison results over the inner dimension to find the index for - // each value. 
- xla::XlaBuilder* builder = ctx->builder(); - auto reduced = - xla::Reduce(comparison_int, XlaHelpers::Zero(builder, accumulation_type), - *ctx->GetOrCreateAdd(accumulation_type), {2}); - - ctx->SetOutput(0, reduced); -} - -class LowerBoundOp : public XlaOpKernel { - public: - explicit LowerBoundOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("out_type", &out_dtype_)); - } - - void Compile(XlaOpKernelContext* ctx) override { - BuildLowerUpperBoundOp(ctx, out_dtype_, xla::ComparisonDirection::kGt); - } - - private: - DataType out_dtype_; -}; - -REGISTER_XLA_OP(Name("LowerBound"), LowerBoundOp); - -class UpperBoundOp : public XlaOpKernel { - public: - explicit UpperBoundOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("out_type", &out_dtype_)); - } - - void Compile(XlaOpKernelContext* ctx) override { - BuildLowerUpperBoundOp(ctx, out_dtype_, xla::ComparisonDirection::kGe); - } - - private: - DataType out_dtype_; -}; - -REGISTER_XLA_OP(Name("UpperBound"), UpperBoundOp); - -} // namespace -} // namespace tensorflow From ca5bef0dcc0dcde2294c2f8b5cb1ca4ad3e2f9cf Mon Sep 17 00:00:00 2001 From: David Rim Date: Wed, 19 Feb 2020 01:51:05 -0800 Subject: [PATCH 225/442] Update hybrid per channel conv to use optimized version of MatrixBatchVectorMultiply PiperOrigin-RevId: 295921158 Change-Id: I64d9aacffb30ce7d6f84e45bbcca497a27c24233 --- tensorflow/lite/kernels/conv.cc | 60 +++- .../kernels/internal/optimized/im2col_utils.h | 23 +- .../internal/optimized/neon_tensor_utils.cc | 308 +++++++++++++++--- .../internal/optimized/neon_tensor_utils.h | 19 ++ .../optimized/neon_tensor_utils_impl.h | 11 + .../internal/optimized/optimized_ops.h | 110 +++++-- .../internal/optimized/sse_tensor_utils.cc | 6 +- .../internal/optimized/sse_tensor_utils.h | 19 ++ .../reference/portable_tensor_utils.cc | 74 ++++- .../reference/portable_tensor_utils.h | 19 ++ .../reference/portable_tensor_utils_impl.h | 12 + .../lite/kernels/internal/tensor_utils.h | 13 + .../kernels/internal/tensor_utils_test.cc | 74 ++++- 13 files changed, 627 insertions(+), 121 deletions(-) diff --git a/tensorflow/lite/kernels/conv.cc b/tensorflow/lite/kernels/conv.cc index 06ac27a6451..8a3539df8d5 100644 --- a/tensorflow/lite/kernels/conv.cc +++ b/tensorflow/lite/kernels/conv.cc @@ -72,6 +72,8 @@ struct OpData { int scaling_factors_id = kTensorNotAllocated; int input_offset_id = kTensorNotAllocated; int accum_scratch_id = kTensorNotAllocated; + // Row sums are used to cache filter sums for hybrid zero-point calculations. 
+ int row_sums_id = kTensorNotAllocated; TfLitePaddingValues padding; // The scaling factor from input to output (aka the 'real multiplier') can @@ -94,13 +96,16 @@ struct OpData { int32_t input_quantized_index; int32_t scaling_factors_index; int32_t accum_scratch_index; - int32_t input_offset_index; + int32_t row_sums_index; + bool need_hwcn_weights = false; bool have_weights_been_transposed = false; bool need_im2col = false; bool supports_multithreaded_kernel = false; + bool is_hybrid_per_channel = false; + bool compute_hybrid_row_sums = true; }; inline PaddingType RuntimePaddingType(TfLitePadding padding) { @@ -278,6 +283,13 @@ static TfLiteStatus AllocateTemporaryTensorsIfRequired(TfLiteContext* context, context, context->AddTensors(context, 1, &data->input_offset_id)); } ++temporaries_count; + + data->row_sums_index = temporaries_count; + if (data->row_sums_id == kTensorNotAllocated) { + TF_LITE_ENSURE_OK(context, + context->AddTensors(context, 1, &data->row_sums_id)); + } + ++temporaries_count; } } @@ -334,7 +346,6 @@ TfLiteStatus Prepare(KernelType kernel_type, TfLiteContext* context, (input->type == kTfLiteFloat32 && (filter->type == kTfLiteUInt8 || filter->type == kTfLiteInt8)); - bool is_hybrid_per_channel = false; if (is_hybrid && filter->type == kTfLiteInt8 && filter->quantization.type == kTfLiteAffineQuantization && filter->quantization.params && @@ -348,7 +359,7 @@ TfLiteStatus Prepare(KernelType kernel_type, TfLiteContext* context, const float scale = affine_quantization->scale->data[0]; for (int i = 1; i < affine_quantization->scale->size; i++) { if (affine_quantization->scale->data[i] != scale) { - is_hybrid_per_channel = true; + data->is_hybrid_per_channel = true; break; } } @@ -362,7 +373,7 @@ TfLiteStatus Prepare(KernelType kernel_type, TfLiteContext* context, (params->dilation_height_factor == 1); TF_LITE_ENSURE_STATUS(AllocateTemporaryTensorsIfRequired( - context, node, is_hybrid, is_hybrid_per_channel, kernel_type)); + context, node, is_hybrid, data->is_hybrid_per_channel, kernel_type)); int channels_in = filter->dims->data[3]; int channels_out = filter->dims->data[0]; @@ -510,7 +521,7 @@ TfLiteStatus Prepare(KernelType kernel_type, TfLiteContext* context, accum_scratch_size)); } - if (is_hybrid_per_channel) { + if (data->is_hybrid_per_channel) { const auto* affine_quantization = reinterpret_cast( filter->quantization.params); @@ -524,13 +535,27 @@ TfLiteStatus Prepare(KernelType kernel_type, TfLiteContext* context, input_offsets->allocation_type = kTfLiteArenaRw; // See above comment for the need to allocate for height of inputs. const int height = NumElements(input) / channels_in; - int scaling_dims[1] = {height}; - if (!TfLiteIntArrayEqualsArray(input_offsets->dims, 1, scaling_dims)) { + const int input_offset_dims[1] = {height}; + if (!TfLiteIntArrayEqualsArray(input_offsets->dims, 1, + input_offset_dims)) { TfLiteIntArray* input_offsets_size = TfLiteIntArrayCreate(1); - input_offsets_size->data[0] = height; + input_offsets_size->data[0] = input_offset_dims[0]; TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_offsets, input_offsets_size)); } + node->temporaries->data[data->row_sums_index] = data->row_sums_id; + TfLiteTensor* row_sums = + GetTemporary(context, node, data->row_sums_index); + row_sums->type = kTfLiteInt32; + row_sums->allocation_type = kTfLiteArenaRwPersistent; + // See above comment for the need to allocate for height of inputs. 
+ const int row_sums_dims[1] = {channels_out}; + if (!TfLiteIntArrayEqualsArray(row_sums->dims, 1, row_sums_dims)) { + TfLiteIntArray* row_sums_size = TfLiteIntArrayCreate(1); + row_sums_size->data[0] = row_sums_dims[0]; + TF_LITE_ENSURE_OK( + context, context->ResizeTensor(context, row_sums, row_sums_size)); + } } } return kTfLiteOk; @@ -733,9 +758,8 @@ void EvalHybridPerChannel(TfLiteContext* context, TfLiteNode* node, const int input_size = NumElements(input) / SizeOfDimension(input, 0); const int batch_size = SizeOfDimension(input, 0); - const TfLiteTensor* input_quantized = - GetTemporary(context, node, data->input_quantized_index); - int8_t* quantized_input_ptr_batch = input_quantized->data.int8; + int8_t* quantized_input_ptr_batch = GetTensorData( + GetTemporary(context, node, data->input_quantized_index)); float* scaling_factors_ptr = GetTensorData( GetTemporary(context, node, data->scaling_factors_index)); int32_t* input_offset_ptr = GetTensorData( @@ -780,13 +804,21 @@ void EvalHybridPerChannel(TfLiteContext* context, TfLiteNode* node, case kGenericOptimized: case kMultithreadOptimized: case kCblasOptimized: { + TfLiteTensor* row_sums = + GetTemporary(context, node, data->row_sums_index); + TfLiteTensor* scratch = + GetTemporary(context, node, data->accum_scratch_index); optimized_ops::HybridConvPerChannel( op_params, scaling_factors_ptr, GetTensorShape(input), quantized_input_ptr_batch, GetTensorShape(filter), filter_ptr, GetTensorShape(bias), GetTensorData(bias), GetTensorShape(output), GetTensorData(output), GetTensorShape(im2col), im2col_ptr, affine_quantization->scale->data, - input_offset_ptr); + input_offset_ptr, GetTensorShape(scratch), + GetTensorData(scratch), GetTensorData(row_sums), + &data->compute_hybrid_row_sums, + CpuBackendContext::GetFromContext(context)); + data->compute_hybrid_row_sums = false; break; } } @@ -876,13 +908,11 @@ TfLiteStatus EvalImpl(TfLiteContext* context, TfLiteNode* node) { data->have_weights_been_transposed = true; } - bool is_hybrid_per_channel = data->input_offset_id != kTensorNotAllocated; - TFLITE_DCHECK_EQ(input_type, input->type); switch (input_type) { // Already know in/outtypes are same. case kTfLiteFloat32: if (filter->type == kTfLiteUInt8 || filter->type == kTfLiteInt8) { - if (is_hybrid_per_channel) { + if (data->is_hybrid_per_channel) { EvalHybridPerChannel(context, node, params, data, input, filter, bias, im2col, output); } else { diff --git a/tensorflow/lite/kernels/internal/optimized/im2col_utils.h b/tensorflow/lite/kernels/internal/optimized/im2col_utils.h index e15e2830e41..fcf9272689f 100644 --- a/tensorflow/lite/kernels/internal/optimized/im2col_utils.h +++ b/tensorflow/lite/kernels/internal/optimized/im2col_utils.h @@ -111,11 +111,12 @@ inline void ExtractPatchIntoBufferColumn(const RuntimeShape& input_shape, int w, } } +// Supports per-batch zero_byte for per-batch asymmetric quantized inputs. 
template -void DilatedIm2col(const ConvParams& params, uint8 zero_byte, - const RuntimeShape& input_shape, const T* input_data, - const RuntimeShape& filter_shape, - const RuntimeShape& output_shape, T* im2col_data) { +void DilatedIm2col(const ConvParams& params, const RuntimeShape& input_shape, + const T* input_data, const RuntimeShape& filter_shape, + const RuntimeShape& output_shape, T* im2col_data, + const int32_t* zero_bytes, const int zero_bytes_len) { const int stride_width = params.stride_width; const int stride_height = params.stride_height; const int dilation_width_factor = params.dilation_width_factor; @@ -127,7 +128,7 @@ void DilatedIm2col(const ConvParams& params, uint8 zero_byte, TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); // For dilated convolution, the input pixels are not contiguous therefore we - // can't use the same opitimizations as Im2Col(). Though note this code would + // can't use the same optimizations as Im2Col(). Though note this code would // work fine for the non-dilated case too (though likely a bit slower). ruy::profiler::ScopeLabel label("DilatedIm2col"); TFLITE_DCHECK(dilation_width_factor != 1 || dilation_height_factor != 1); @@ -153,6 +154,8 @@ void DilatedIm2col(const ConvParams& params, uint8 zero_byte, // Loop through the output rows (B x H x W) for (int batch = 0; batch < batches; ++batch) { + const T zero_byte = zero_bytes_len > 1 ? static_cast(zero_bytes[batch]) + : static_cast(zero_bytes[0]); for (int out_y = 0; out_y < output_height; ++out_y) { for (int out_x = 0; out_x < output_width; ++out_x) { // Each im2col row is an output pixel. Arrange the input data in this @@ -194,6 +197,16 @@ void DilatedIm2col(const ConvParams& params, uint8 zero_byte, } } +template +void DilatedIm2col(const ConvParams& params, uint8 zero_byte, + const RuntimeShape& input_shape, const T* input_data, + const RuntimeShape& filter_shape, + const RuntimeShape& output_shape, T* im2col_data) { + const int32_t zero_point = static_cast(zero_byte); + DilatedIm2col(params, input_shape, input_data, filter_shape, output_shape, + im2col_data, &zero_point, 1); +} + template void Im2col(const ConvParams& params, int kheight, int kwidth, uint8 zero_byte, const RuntimeShape& input_shape, const T* input_data, diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc index 6ab57c9a7df..8e0c77a8d5c 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc @@ -456,13 +456,14 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate( const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, const int8_t* vectors, const float* scaling_factors, int n_batch, float* __restrict__ result, const float* per_channel_scale, - const int32_t* input_offset) { + const int32_t* input_offset, int32_t* row_sums) { void* shuffled_vectors_free; const int8_t* shuffled_vectors = ShuffleVectors(vectors, n_batch, m_cols, &shuffled_vectors_free); for (int row = 0; row < m_rows; row += 2) { const float* channel_scales_ptr = per_channel_scale + row; + int32_t* row_sums_ptr = row_sums ? 
row_sums + row : nullptr; for (int batch = 0; batch < n_batch; batch += 4) { float* result_ptr = result + (batch * m_rows) + row; const int8* mat_ptr0 = matrix + (row * m_cols); @@ -472,7 +473,8 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate( const float* scaling_factors_ptr = scaling_factors + batch; const uint64_t wide_rows = m_rows * sizeof(float); const int32_t* batch_offsets_ptr = input_offset + batch; - + const int32_t is_channel_scale_nullptr = per_channel_scale == nullptr; + const int32_t is_row_sums_nullptr = row_sums_ptr == nullptr; asm volatile( "dup v0.4s, wzr\n" "dup v1.4s, wzr\n" @@ -480,16 +482,23 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate( "dup v3.4s, wzr\n" // Load zero points. "ld1 {v7.4s}, [%[batch_offsets_ptr]]\n" - + "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n" // Zero out zero point accumulators. "dup v14.4s, wzr\n" "dup v15.4s, wzr\n" - // Load per channel scales + // Load per channel scales if not null. + "cmp %w[is_channel_scale_nullptr], #0\n" + "bne 1f\n" "ld1r {v16.4s}, [%[channel_scales_ptr]], #4\n" "ld1r {v17.4s}, [%[channel_scales_ptr]]\n" - + "fmul v16.4s, v16.4s, v4.4s\n" + "fmul v17.4s, v17.4s, v4.4s\n" + "b 2f\n" "1:\n" + "mov v16.4s, v4.4s\n" + "mov v17.4s, v4.4s\n" + "2:\n" "ld1 {v12.16b}, [%[mat_ptr0]], #16\n" "ld1 {v8.16b}, [%[vec_ptr]], #16\n" ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n" @@ -504,25 +513,32 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate( ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n" ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n" ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n" - + "cmp %w[is_row_sums_nullptr], #1\n" + "bne 3f\n" // Accumulate row_sums for zero point calculations. "saddlp v12.8h, v12.16b\n" "saddlp v13.8h, v13.16b\n" "sadalp v14.4s, v12.8h\n" "sadalp v15.4s, v13.8h\n" - + "3:\n" "cmp %[mat_ptr0], %[mat_ptr0_end]\n" - "bne 1b\n" + "bne 2b\n" "add v0.4s, v0.4s, v1.4s\n" "add v2.4s, v2.4s, v3.4s\n" + "cmp %w[is_row_sums_nullptr], #1\n" + "bne 4f\n" // Calculate zero point offsets. - "addv s12, v14.4s\n" - "addv s13, v15.4s\n" - "fmov w0, s12\n" - "fmov w1, s13\n" - "dup v14.4s, w0\n" - "dup v15.4s, w1\n" + "addv s14, v14.4s\n" + "addv s15, v15.4s\n" + "dup v14.4s, v14.s[0]\n" + "dup v15.4s, v15.s[0]\n" + "b 5f\n" + "4:\n" + "ld1r {v14.4s}, [%[row_sums_ptr]], #4\n" + "ld1r {v15.4s}, [%[row_sums_ptr]]\n" + "5:\n" + "mul v14.4s, v14.4s, v7.4s\n" "mul v15.4s, v15.4s, v7.4s\n" "sub v0.4s, v0.4s, v14.4s\n" @@ -530,11 +546,8 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate( "scvtf v0.4s, v0.4s\n" "scvtf v1.4s, v2.4s\n" - "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n" - "fmul v0.4s, v4.4s, v0.4s\n" - "fmul v1.4s, v4.4s, v1.4s\n" - // Multiply channel scales. + // Multiply scale. 
"fmul v0.4s, v16.4s, v0.4s\n" "fmul v1.4s, v17.4s, v1.4s\n" @@ -550,12 +563,15 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate( "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" : [ mat_ptr0 ] "+r"(mat_ptr0), [ mat_ptr1 ] "+r"(mat_ptr1), - [ vec_ptr ] "+r"(vec_ptr), [ result_ptr ] "+r"(result_ptr) + [ vec_ptr ] "+r"(vec_ptr), [ result_ptr ] "+r"(result_ptr), + [ row_sums_ptr ] "+r"(row_sums_ptr) : [ mat_ptr0_end ] "r"(mat_ptr0_end), [ scaling_factors_ptr ] "r"(scaling_factors_ptr), [ wide_rows ] "r"(wide_rows), [ channel_scales_ptr ] "r"(channel_scales_ptr), - [ batch_offsets_ptr ] "r"(batch_offsets_ptr) + [ batch_offsets_ptr ] "r"(batch_offsets_ptr), + [ is_channel_scale_nullptr ] "r"(is_channel_scale_nullptr), + [ is_row_sums_nullptr ] "r"(is_row_sums_nullptr) : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "w0", "w1", "cc", "memory"); @@ -565,6 +581,16 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate( free(shuffled_vectors_free); } +static void DotprodMatrixBatchFourVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, + const int8_t* vectors, const float* scaling_factors, int n_batch, + float* __restrict__ result, const float* per_channel_scale, + const int32_t* input_offset) { + DotprodMatrixBatchFourVectorMultiplyAccumulate( + matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, + per_channel_scale, input_offset, nullptr); +} + // The DotprodMatrixBatchFourVectorMultiplyAccumulate kernel processes 4 // vectors in the same time as the baseline processes 1 vector. However, it // requires 4 vectors of input. @@ -591,7 +617,8 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate( void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, const int8_t* vectors, const float* scaling_factors, int n_batch, - float* __restrict__ result) { + float* __restrict__ result, const float* per_channel_scale, + const int32_t* input_offset, int32_t* row_sums) { const int kWeightsPerUint32 = 4; // Round to the nearest multiple of 4. @@ -630,11 +657,30 @@ void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( memset(padded_scaling_factors, 0, batch_round_up * sizeof(float)); memcpy(padded_scaling_factors, scaling_factors, n_batch * sizeof(float)); - // Call the main kernel. - DotprodMatrixBatchFourVectorMultiplyAccumulate( - matrix, m_rows, m_cols, padded_vectors, padded_scaling_factors, - batch_round_up, padded_result); + if (input_offset != nullptr) { + void* padded_input_offset_free; + const int padded_input_offset_size = batch_round_up * sizeof(int32_t); + int32_t* padded_input_offset = reinterpret_cast( + aligned_alloc(kWeightsPerUint32, padded_input_offset_size, + &padded_input_offset_free)); + TFLITE_CHECK_LE(n_batch * sizeof(int32_t), padded_input_offset_size); + TFLITE_CHECK_LE(batch_round_up * sizeof(int32_t), padded_input_offset_size); + memset(padded_input_offset, 0, batch_round_up * sizeof(int32_t)); + memcpy(padded_input_offset, input_offset, n_batch * sizeof(int32_t)); + // Call the main kernel. + DotprodMatrixBatchFourVectorMultiplyAccumulate( + matrix, m_rows, m_cols, padded_vectors, padded_scaling_factors, + batch_round_up, padded_result, per_channel_scale, padded_input_offset, + row_sums); + + free(padded_input_offset_free); + } else { + // Call the main kernel. 
+ DotprodMatrixBatchFourVectorMultiplyAccumulate( + matrix, m_rows, m_cols, padded_vectors, padded_scaling_factors, + batch_round_up, padded_result); + } memcpy(result, padded_result, result_size); free(padded_result_free); @@ -642,6 +688,16 @@ void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( free(padded_scaling_factors_free); } +void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, + const int8_t* vectors, const float* scaling_factors, int n_batch, + float* __restrict__ result) { + DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( + matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, + /*per_channel_scale=*/nullptr, /*input_offset=*/nullptr, + /*row_sums=*/nullptr); +} + static void DotprodSparseMatrixBatchVectorMultiplyAccumulate( const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows, const int m_cols, const int8_t* __restrict__ vectors, @@ -1211,18 +1267,25 @@ void NeonMatrixScalarMultiplyAccumulate(const int8_t* matrix, int32_t scalar, } } -void NeonMatrixBatchVectorMultiplyAccumulate( +void NeonMatrixBatchVectorMultiplyAccumulateImpl( const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, const int8_t* __restrict__ vectors, const float* scaling_factors, int n_batch, float* __restrict__ result, int result_stride, - const float* per_channel_scale, const int32_t* input_offset) { + const float* per_channel_scale, const int32_t* input_offset, + int32_t* row_sums) { #ifdef __aarch64__ if (HasSdotInstruction() && m_cols % 16 == 0 && m_rows % 2 == 0 && m_rows >= n_batch) { if (n_batch % 4 == 0 && result_stride == 1) { DotprodMatrixBatchFourVectorMultiplyAccumulate( matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, - per_channel_scale, input_offset); + per_channel_scale, input_offset, row_sums); + return; + } else if (result_stride == 1 && n_batch >= 2 && + m_rows * m_cols >= 128 * 128) { + DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( + matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, + per_channel_scale, input_offset, row_sums); return; } } @@ -1248,6 +1311,7 @@ void NeonMatrixBatchVectorMultiplyAccumulate( for (int batch = 0; batch < n_batch; ++batch) { const float batch_scaling_factor = scaling_factors[batch]; + const int batch_input_offset = input_offset[batch]; memcpy(aligned_vec, vectors + batch * m_cols, sizeof(int8_t) * m_cols); for (int row = 0; row < m_rows; ++row, result += result_stride) { int8_t* row_ptr = (int8_t*)matrix + row * m_cols; // NOLINT @@ -1255,65 +1319,171 @@ void NeonMatrixBatchVectorMultiplyAccumulate( memcpy(aligned_row, row_ptr, sizeof(int8_t) * m_cols); row_ptr = aligned_row; } + float scale = batch_scaling_factor; + if (per_channel_scale) { + scale *= per_channel_scale[row]; + } + // Initialize the dot product sum for the row to 0. int32x4_t dotprod_32x4 = vmovq_n_s32(0); - // Initialize row sums to 0. - int32x4_t row_sum_32x4 = vmovq_n_s32(0); - + int32x4_t row_sum_32x4; + if (row_sums == nullptr) { + row_sum_32x4 = vmovq_n_s32(0); + } + // Prefetch the row to cache. __builtin_prefetch(row_ptr, 0 /* prefetch for read */, 3 /* temporal locality */); + // For every block of 16 8-bit elements. int col = 0; for (; col < postamble_half_start; col += kWeightsPerNeonLane) { + // Load 16 8-bit values from the row and vector, each, to operate on. + // Here the assumption is that each buffer is 4-byte aligned. Otherwise, + // performance may suffer significantly. 
TFLITE_DCHECK_EQ( // NOLINT (uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1), 0); const int8x16_t s1_8x16 = vld1q_s8((const int8_t*)(aligned_vec + col)); const int8x16_t s2_8x16 = vld1q_s8((const int8_t*)(row_ptr + col)); + // Multiply the low bits (i.e. the lower 8 8bit numbers in the + // registers). int16x8_t prod_16x8 = vmull_s8(vget_low_s8(s1_8x16), vget_low_s8(s2_8x16)); + // Multiply the high bits (i.e. the higher 8 8bit numbers in the + // registers), and accumulate with the result of the low bits product. + // The assumption here is that overflow will not happen as we quantize + // our values to be in the range [-127, 127]. As such the sum of the 2 + // products is always strictly smaller than 15-bits (32767 in absolute + // value). prod_16x8 = vmlal_s8(prod_16x8, vget_high_s8(s1_8x16), vget_high_s8(s2_8x16)); - dotprod_32x4 = vpadalq_s16(dotprod_32x4, prod_16x8); + if (row_sums == nullptr) { + const int16x8_t row_sum_16x8 = vpaddlq_s8(s2_8x16); + row_sum_32x4 = vpadalq_s16(row_sum_32x4, row_sum_16x8); + } + } // for col - // Compute the row sums. - const int16x8_t row_sum_16x8 = vpaddlq_s8(s2_8x16); - row_sum_32x4 = vpadalq_s16(row_sum_32x4, row_sum_16x8); - } - + // Half iteration dealing only 8 elements if (col < postamble_start) { + // Load 8 8-bit values from the row and column each to operate on. + // Here the assumption is that each buffer is 4-bytes aligned. + // Otherwise, performance may suffer significantly. TFLITE_DCHECK_EQ( // NOLINT (uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1), 0); const int8x8_t s1_8x8 = vld1_s8((const int8_t*)(aligned_vec + col)); const int8x8_t s2_8x8 = vld1_s8((const int8_t*)(row_ptr + col)); const int16x8_t prod_16x8 = vmull_s8(s1_8x8, s2_8x8); dotprod_32x4 = vpadalq_s16(dotprod_32x4, prod_16x8); - - // Extend row values to 16 bit and add to the row sums. - const int16x8_t row_sum_16x8 = vmovl_s8(s2_8x8); - row_sum_32x4 = vpadalq_s16(row_sum_32x4, row_sum_16x8); + if (row_sums == nullptr) { + const int16x8_t row_sum_16x8 = vmovl_s8(s2_8x8); + row_sum_32x4 = vpadalq_s16(row_sum_32x4, row_sum_16x8); + } col += (kWeightsPerNeonLane >> 1); } - // Reduce to scalar and multiply the batch offset. - int32_t row_sum = AccumulateNeonLane(row_sum_32x4); int32_t dotprod = AccumulateNeonLane(dotprod_32x4); + int32_t row_sum = row_sums == nullptr ? AccumulateNeonLane(row_sum_32x4) + : row_sums[row]; + + // Postamble loop. for (; col < m_cols; ++col) { dotprod += row_ptr[col] * aligned_vec[col]; - row_sum += row_ptr[col]; - } - const int32_t batch_offset = input_offset[batch]; - dotprod -= row_sum * batch_offset; - // Multipy the per-channel scale. 
- *result += dotprod * batch_scaling_factor * per_channel_scale[row]; - } - } + if (row_sums == nullptr) { + row_sum += row_ptr[col]; + } + } // for col + dotprod -= row_sum * batch_input_offset; + *result += dotprod * scale; + } // for row + } // for batch if (unaligned) { free(aligned_row_free); } free(aligned_vec_free); } +void NeonMatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, + const int8_t* __restrict__ vectors, const float* scaling_factors, + int n_batch, float* __restrict__ result, int result_stride, + const float* per_channel_scale, const int32_t* input_offset, + int32_t* scratch, int32_t* row_sums, bool* compute_row_sums, + CpuBackendContext* context) { + if (compute_row_sums == nullptr || *compute_row_sums) { + memset(row_sums, 0, sizeof(int32_t) * m_rows); + NeonReductionSumVector(matrix, row_sums, m_rows, m_cols); + if (compute_row_sums) { + *compute_row_sums = false; + } + } + +#ifdef TFLITE_WITH_RUY_GEMV + if (m_rows % 4 == 0 && result_stride == 1) { + const int32_t* bias = static_cast(nullptr); + NeonCpuBackendGemm(vectors, bias, matrix, n_batch, m_cols, m_rows, 0, + scratch, context); + + // Multiply by float scaling factors and write to result + const int total_size = n_batch * m_rows; + int i = 0; + int32_t* scratch_ptr = scratch; + for (; i <= total_size - 8; i += 8, result += 8 * result_stride) { + float batch_scaling_factor0 = scaling_factors[i / m_rows]; + float batch_scaling_factor1 = scaling_factors[(i + 4) / m_rows]; + if (per_channel_scale) { + batch_scaling_factor0 *= per_channel_scale[i % m_rows]; + batch_scaling_factor1 *= per_channel_scale[(i + 4) % m_rows]; + } + const int batch_input_offset0 = -input_offset[i / m_rows]; + const int batch_input_offset1 = -input_offset[(i + 4) / m_rows]; + const float32x4_t scaling_factor0 = vdupq_n_f32(batch_scaling_factor0); + const float32x4_t scaling_factor1 = vdupq_n_f32(batch_scaling_factor1); + const int32x4_t input_offset0 = vdupq_n_s32(batch_input_offset0); + const int32x4_t input_offset1 = vdupq_n_s32(batch_input_offset1); + const int32x4_t row_sum0 = vld1q_s32(row_sums + (i % m_rows)); + const int32x4_t row_sum1 = vld1q_s32(row_sums + ((i + 4) % m_rows)); + const int32x4_t scratch_val0 = vld1q_s32(scratch_ptr + i); + const int32x4_t scratch_val1 = vld1q_s32(scratch_ptr + i + 4); + const int32x4_t dotprod0 = + vmlaq_s32(scratch_val0, row_sum0, input_offset0); + const int32x4_t dotprod1 = + vmlaq_s32(scratch_val1, row_sum1, input_offset1); + const float32x4_t float_val0 = vcvtq_f32_s32(dotprod0); + const float32x4_t float_val1 = vcvtq_f32_s32(dotprod1); + const float32x4_t result0 = + vmlaq_f32(vld1q_f32(result), float_val0, scaling_factor0); + const float32x4_t result1 = vmlaq_f32( + vld1q_f32(result + 4 * result_stride), float_val1, scaling_factor1); + vst1q_f32(result, result0); + vst1q_f32(result + 4 * result_stride, result1); + } + + scratch_ptr += i; + for (; i < total_size; i++, result += result_stride) { + const float batch_scaling_factor = scaling_factors[i / m_rows]; + const int32_t zero_point = input_offset[i / m_rows]; + int32_t x = *(scratch_ptr++); + x -= row_sums[i % m_rows] * zero_point; + *result += x * batch_scaling_factor; + } + return; + } +#endif + NeonMatrixBatchVectorMultiplyAccumulateImpl( + matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, + result_stride, per_channel_scale, input_offset, row_sums); +} + +void NeonMatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const int m_rows, const int 
m_cols, + const int8_t* __restrict__ vectors, const float* scaling_factors, + int n_batch, float* __restrict__ result, int result_stride, + const float* per_channel_scale, const int32_t* input_offset) { + NeonMatrixBatchVectorMultiplyAccumulateImpl( + matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, + result_stride, per_channel_scale, input_offset, nullptr); +} + inline int64x2x2_t MulAdd(int32x4_t acc, int32x4_t lhs, int32x4_t rhs) { int64x2x2_t result; const int64x2_t lhs_low = vmovl_s32(vget_low_s32(lhs)); @@ -2201,8 +2371,10 @@ void NeonAsymmetricQuantizeFloats(const float* values, const int size, const double qmin_double = kMinScale; const double qmax_double = kMaxScale; if (rmin == rmax) { - *scaling_factor = 0; + memset(quantized_values, 0, size * sizeof(int8_t)); + *scaling_factor = 1; *offset = 0; + return; } else { const double scale = (rmax - rmin) / (qmax_double - qmin_double); const double zero_point_from_min = qmin_double - rmin / scale; @@ -2216,9 +2388,9 @@ void NeonAsymmetricQuantizeFloats(const float* values, const int size, ? zero_point_from_min : zero_point_from_max; int8 nudged_zero_point = 0; - if (zero_point_double < qmin_double) { + if (zero_point_double <= qmin_double) { nudged_zero_point = kMinScale; - } else if (zero_point_double > qmax_double) { + } else if (zero_point_double >= qmax_double) { nudged_zero_point = kMaxScale; } else { nudged_zero_point = static_cast(round(zero_point_double)); @@ -2320,6 +2492,34 @@ void NeonReductionSumVector(const float* input_vector, float* output_vector, } } +void NeonReductionSumVector(const int8_t* input_vector, int32_t* output_vector, + const int output_size, const int reduction_size) { + constexpr int kWeightsPerNeonLane = 16; + const int postamble_half_start = reduction_size & ~(kWeightsPerNeonLane - 1); + const int postamble_start = + reduction_size & ~((kWeightsPerNeonLane >> 1) - 1); + for (int o = 0; o < output_size; ++o) { + // Get the address of the first element of the row. 
+ int8_t* row_ptr = (int8_t*)input_vector + o * reduction_size; // NOLINT + int32x4_t sum_32x4 = vmovq_n_s32(0); + int r = 0; + for (; r < postamble_half_start; r += kWeightsPerNeonLane) { + const int8x16_t s2_8x16 = vld1q_s8((const int8_t*)(row_ptr + r)); + sum_32x4 = vpadalq_s16(sum_32x4, vpaddlq_s8(s2_8x16)); + } + if (r < postamble_start) { + const int8x8_t s2_8x8 = vld1_s8((const int8_t*)(row_ptr + r)); + sum_32x4 = vpadalq_s16(sum_32x4, vmovl_s8(s2_8x8)); + r += (kWeightsPerNeonLane >> 1); + } + int32_t sum = AccumulateNeonLane(sum_32x4); + for (; r < reduction_size; ++r) { + sum += row_ptr[r]; + } + output_vector[o] += sum; + } +} + } // namespace tensor_utils } // namespace tflite diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h index 23158a37e0a..f82926825ed 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h @@ -62,6 +62,19 @@ void MatrixBatchVectorMultiplyAccumulate( result_stride, per_channel_scale, input_offset); } +void MatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, + const int8_t* __restrict__ vectors, const float* scaling_factors, + int n_batch, float* __restrict__ result, int result_stride, + const float* per_channel_scale, const int32_t* input_offset, + int32_t* scratch, int32_t* row_sums, bool* compute_row_sums, + CpuBackendContext* context) { + return NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, + m_cols, vectors, scaling_factors, n_batch, result, + result_stride, per_channel_scale, input_offset, + scratch, row_sums, compute_row_sums, context); +} + void SparseMatrixBatchVectorMultiplyAccumulate( const float* __restrict__ matrix, const uint8_t* __restrict__ ledger, int m_rows, int m_cols, const float* __restrict__ vector, int n_batch, @@ -236,6 +249,12 @@ void ReductionSumVector(const int32_t* input_vector, int32_t* output_vector, reduction_size); } +void ReductionSumVector(const int8_t* input_vector, int32_t* output_vector, + int output_size, int reduction_size) { + NEON_OR_PORTABLE(ReductionSumVector, input_vector, output_vector, output_size, + reduction_size); +} + void MeanStddevNormalization(const float* input_vector, float* output_vector, int v_size, int n_batch) { PortableMeanStddevNormalization(input_vector, output_vector, v_size, n_batch); diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h index ea8955b9395..7b476d30092 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h @@ -52,6 +52,14 @@ void NeonMatrixBatchVectorMultiplyAccumulate( int result_stride, CpuBackendContext* context); // Matrix multiplication for quantized values using asymmetric quantization. 
+void NeonMatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, + const int8_t* __restrict__ vectors, const float* scaling_factors, + int n_batch, float* __restrict__ result, int result_stride, + const float* per_channel_scale, const int32_t* input_offset, + int32_t* scratch, int32_t* row_sums, bool* compute_row_sums, + CpuBackendContext* context); + void NeonMatrixBatchVectorMultiplyAccumulate( const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, const int8_t* __restrict__ vectors, const float* scaling_factors, @@ -162,6 +170,9 @@ void NeonAsymmetricQuantizeFloats(const float* values, const int size, void NeonReductionSumVector(const float* input_vector, float* output_vector, int output_size, int reduction_size); +void NeonReductionSumVector(const int8_t* input_vector, int32_t* output_vector, + int output_size, int reduction_size); + #endif // USE_NEON } // namespace tensor_utils diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h index abb712ddf60..7149cfaaaeb 100644 --- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h @@ -1331,69 +1331,109 @@ inline void HybridConvPerChannel( const RuntimeShape& bias_shape, const float* bias_data, const RuntimeShape& output_shape, float* output_data, const RuntimeShape& im2col_shape, int8_t* im2col_data, - const float* per_channel_scale, int32_t* input_offset) { + const float* per_channel_scale, int32_t* input_offset, + const RuntimeShape& scratch_shape, int32_t* scratch, int32_t* row_sums, + bool* compute_row_sums, CpuBackendContext* cpu_backend_context) { + ruy::profiler::ScopeLabel label("ConvHybridPerChannel"); const int stride_width = params.stride_width; const int stride_height = params.stride_height; - const float output_activation_min = params.float_activation_min; - const float output_activation_max = params.float_activation_max; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); - const int batch_size = input_shape.Dims(0); + const int8* gemm_input_data = nullptr; + const RuntimeShape* gemm_input_shape = nullptr; const int filter_width = filter_shape.Dims(2); const int filter_height = filter_shape.Dims(1); - - const int8_t* gemm_input_data = nullptr; - int num_input; + const bool need_dilated_im2col = + dilation_width_factor != 1 || dilation_height_factor != 1; const bool need_im2col = stride_width != 1 || stride_height != 1 || filter_width != 1 || filter_height != 1; - if (need_im2col) { + const int batch_size = input_shape.Dims(0); + + if (need_dilated_im2col) { TFLITE_DCHECK(im2col_data); + optimized_ops::DilatedIm2col(params, input_shape, input_data, filter_shape, + output_shape, im2col_data, input_offset, + batch_size); + gemm_input_data = im2col_data; + gemm_input_shape = &im2col_shape; + } else if (need_im2col) { Im2col(params, filter_height, filter_width, input_offset, batch_size, input_shape, input_data, im2col_shape, im2col_data); gemm_input_data = im2col_data; - num_input = im2col_shape.FlatSize(); + gemm_input_shape = &im2col_shape; } else { TFLITE_DCHECK(!im2col_data); gemm_input_data = input_data; - num_input = input_shape.FlatSize(); + gemm_input_shape = 
&input_shape; } const int filter_rows = filter_shape.Dims(0); const int filter_cols = FlatSizeSkipDim(filter_shape, 0); - const int gemm_input_cols = filter_cols; - const int gemm_input_rows = num_input / gemm_input_cols; + const int gemm_input_rows = gemm_input_shape->Dims(3); + const int gemm_input_cols = FlatSizeSkipDim(*gemm_input_shape, 3); + const int output_rows = output_shape.Dims(3); + const int output_cols = + output_shape.Dims(0) * output_shape.Dims(1) * output_shape.Dims(2); - const int output_cols = output_shape.Dims(3); - const int output_rows = FlatSizeSkipDim(output_shape, 3); - TFLITE_DCHECK_EQ(output_cols, filter_rows); - TFLITE_DCHECK_EQ(output_rows, gemm_input_rows); - TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_cols); - - const int rows_per_batch = gemm_input_rows / batch_size; - - // MatrixBatchVectorMultiplyAccumulate assumes that each row of the second - // input matrix has its own scale factor and zero point. - // This code duplicates the scale factors and zero point for each row in the - // same batch. - for (int i = gemm_input_rows - 1; i >= 0; --i) { - scaling_factors_ptr[i] = scaling_factors_ptr[i / rows_per_batch]; - input_offset[i] = input_offset[i / rows_per_batch]; + TFLITE_DCHECK_EQ(output_rows, filter_rows); + TFLITE_DCHECK_EQ(output_cols, gemm_input_cols); + TFLITE_DCHECK_EQ(filter_cols, gemm_input_rows); + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_rows); + TFLITE_DCHECK_EQ(scratch_shape.FlatSize(), output_shape.FlatSize()); + if (!compute_row_sums || *compute_row_sums) { + memset(row_sums, 0, sizeof(int32_t) * filter_rows); + tensor_utils::ReductionSumVector(filter_data, row_sums, filter_rows, + filter_cols); + if (compute_row_sums) { + *compute_row_sums = false; + } } - std::fill_n(output_data, output_rows * output_cols, 0.0f); + cpu_backend_gemm::MatrixParams lhs_params; + lhs_params.rows = filter_rows; + lhs_params.cols = filter_cols; + lhs_params.order = cpu_backend_gemm::Order::kRowMajor; - tensor_utils::MatrixBatchVectorMultiplyAccumulate( - filter_data, filter_rows, filter_cols, gemm_input_data, - scaling_factors_ptr, /*n_batch=*/gemm_input_rows, output_data, - /*result_stride=*/1, per_channel_scale, input_offset); + cpu_backend_gemm::MatrixParams rhs_params; + rhs_params.order = cpu_backend_gemm::Order::kColMajor; + rhs_params.rows = gemm_input_rows; + rhs_params.cols = gemm_input_cols; - AddBiasAndEvalActivationFunction(output_activation_min, output_activation_max, - bias_shape, bias_data, output_shape, - output_data); + cpu_backend_gemm::MatrixParams dst_params; + dst_params.order = cpu_backend_gemm::Order::kColMajor; + dst_params.rows = output_rows; + dst_params.cols = output_cols; + + // TODO(b/149003801): Use hybrid gemm once supported in Ruy. 
+ cpu_backend_gemm::GemmParams gemm_params; + cpu_backend_gemm::Gemm(lhs_params, filter_data, rhs_params, gemm_input_data, + dst_params, scratch, gemm_params, cpu_backend_context); + + MatrixMap out_mat(output_data, filter_rows, output_cols); + MatrixMap in_mat(scratch, filter_rows, output_cols); + VectorMap bias_data_vec(bias_data, filter_rows, 1); + VectorMap row_sums_vec(row_sums, filter_rows, 1); + VectorMap per_channel_scale_vec(per_channel_scale, filter_rows, + 1); + const int cols_per_batch = output_cols / batch_size; + for (int c = 0; c < output_cols; c++) { + const int b = c / cols_per_batch; + const float input_scale = scaling_factors_ptr[b]; + const int32_t zero_point = input_offset[b]; + out_mat.col(c) = + (((in_mat.col(c) - (row_sums_vec * zero_point)) + .cast() + .cwiseProduct((per_channel_scale_vec * input_scale))) + + bias_data_vec) + .cwiseMin(params.float_activation_max) + .cwiseMax(params.float_activation_min); + } } inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, diff --git a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.cc b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.cc index 59e6ab5594f..05d1be90ef0 100644 --- a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.cc @@ -138,6 +138,10 @@ void SseMatrixBatchVectorMultiplyAccumulate( const float batch_scaling_factor = scaling_factors[batch]; for (int row = 0; row < m_rows; ++row, result += result_stride) { const int8_t* __restrict__ row_ptr = matrix + row * m_cols; + float scale = batch_scaling_factor; + if (per_channel_scale != nullptr) { + scale *= per_channel_scale[row]; + } __m128i dotprod_32x4 = _mm_setzero_si128(); __m128i row_sum_16x8 = _mm_setzero_si128(); int col = 0; @@ -167,7 +171,7 @@ void SseMatrixBatchVectorMultiplyAccumulate( row_sum += row_ptr[col]; } // for col sum -= row_sum * input_offset[batch]; - *result += sum * batch_scaling_factor * per_channel_scale[row]; + *result += sum * scale; } // for row vectors += m_cols; } // for batch diff --git a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h index c747ba9b520..a0cbcd2d9bf 100644 --- a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h @@ -54,6 +54,19 @@ void MatrixBatchVectorMultiplyAccumulate( vectors, scaling_factors, n_batch, result, result_stride); } +void MatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, + const int8_t* __restrict__ vectors, const float* scaling_factors, + int n_batch, float* __restrict__ result, int result_stride, + const float* per_channel_scale, const int32_t* input_offset, + int32_t* scratch, int32_t* row_sums, bool* compute_row_sums, + CpuBackendContext* context) { + NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, + vectors, scaling_factors, n_batch, result, result_stride, + per_channel_scale, input_offset, scratch, row_sums, + compute_row_sums, context); +} + void MatrixBatchVectorMultiplyAccumulate( const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, const int8_t* __restrict__ vectors, @@ -250,6 +263,12 @@ void ReductionSumVector(const int32_t* input_vector, int32_t* output_vector, reduction_size); } +void ReductionSumVector(const int8_t* input_vector, int32_t* output_vector, + int output_size, int reduction_size) { + 
NEON_OR_PORTABLE(ReductionSumVector, input_vector, output_vector, output_size, + reduction_size); +} + void MeanStddevNormalization(const float* input_vector, float* output_vector, int v_size, int n_batch) { PortableMeanStddevNormalization(input_vector, output_vector, v_size, n_batch); diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc index d04fbf3be66..5d7907b20ef 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc @@ -77,15 +77,16 @@ void PortableAsymmetricQuantizeFloats(const float* values, const int size, const int32_t kMaxScale = 127; const double qmin_double = kMinScale; const double qmax_double = kMaxScale; - float rmin = 0.0, rmax = 0.0; const auto minmax = std::minmax_element(values, values + size); - rmin = rmin < *minmax.first ? rmin : *minmax.first; - rmax = rmax > *minmax.second ? rmax : *minmax.second; + const double rmin = std::fmin(0, *minmax.first); + const double rmax = std::fmax(0, *minmax.second); if (rmin == rmax) { - *scaling_factor = 0; + memset(quantized_values, 0, size * sizeof(int8_t)); + *scaling_factor = 1; *offset = 0; + return; } else { - const double scale = (rmax - rmin) / (qmax_double - qmin_double); + double scale = (rmax - rmin) / (qmax_double - qmin_double); const double zero_point_from_min = qmin_double - rmin / scale; const double zero_point_from_max = qmax_double - rmax / scale; const double zero_point_from_min_error = @@ -97,9 +98,9 @@ void PortableAsymmetricQuantizeFloats(const float* values, const int size, ? zero_point_from_min : zero_point_from_max; int8 nudged_zero_point = 0; - if (zero_point_double < qmin_double) { + if (zero_point_double <= qmin_double) { nudged_zero_point = kMinScale; - } else if (zero_point_double > qmax_double) { + } else if (zero_point_double >= qmax_double) { nudged_zero_point = kMaxScale; } else { nudged_zero_point = static_cast(round(zero_point_double)); @@ -107,8 +108,7 @@ void PortableAsymmetricQuantizeFloats(const float* values, const int size, *scaling_factor = scale; *offset = nudged_zero_point; } - const float scaling_factor_inv = - *scaling_factor == 0 ? 0 : 1.0 / *scaling_factor; + const float scaling_factor_inv = 1.0 / *scaling_factor; for (int i = 0; i < size; ++i) { const int32_t quantized_value = static_cast( TfLiteRound(*offset + values[i] * scaling_factor_inv)); @@ -172,6 +172,10 @@ void PortableMatrixBatchVectorMultiplyAccumulate( const int8_t* row_ptr = matrix; for (int row = 0; row < m_rows; ++row, result += result_stride) { int32_t dotprod = 0; + float scale = batch_scaling_factor; + if (per_channel_scale) { + scale *= per_channel_scale[row]; + } #if defined(__GNUC__) // Prefetch the row to cache. 
__builtin_prefetch(row_ptr, 0 /* prefetch for read */, @@ -180,7 +184,46 @@ void PortableMatrixBatchVectorMultiplyAccumulate( for (int col = 0; col < m_cols; ++col, ++row_ptr) { dotprod += (*row_ptr) * (vectors[col] - batch_offset); } // for col - *result += dotprod * batch_scaling_factor * per_channel_scale[row]; + *result += dotprod * scale; + } // for row + } // for batch +} + +void PortableMatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, + const int8_t* __restrict__ vectors, const float* scaling_factors, + int n_batch, float* __restrict__ result, int result_stride, + const float* per_channel_scale, const int32_t* input_offset, + int32_t* scratch, int32_t* row_sums, bool* compute_row_sums, + CpuBackendContext* context) { + if (!compute_row_sums || *compute_row_sums) { + memset(row_sums, 0, sizeof(int32_t) * m_rows); + PortableReductionSumVector(matrix, row_sums, m_rows, m_cols); + if (compute_row_sums) { + *compute_row_sums = false; + } + } + + for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) { + const float batch_scaling_factor = scaling_factors[batch]; + const float batch_offset = input_offset[batch]; + const int8_t* row_ptr = matrix; + for (int row = 0; row < m_rows; ++row, result += result_stride) { + int32_t dotprod = 0; + float scale = batch_scaling_factor; + if (per_channel_scale) { + scale *= per_channel_scale[row]; + } +#if defined(__GNUC__) + // Prefetch the row to cache. + __builtin_prefetch(row_ptr, 0 /* prefetch for read */, + 3 /* temporal locality */); +#endif + for (int col = 0; col < m_cols; ++col, ++row_ptr) { + dotprod += (*row_ptr) * vectors[col]; + } // for col + dotprod -= row_sums[row] * batch_offset; + *result += dotprod * scale; } // for row } // for batch } @@ -586,6 +629,17 @@ void PortableReductionSumVector(const int32_t* input_vector, } } +void PortableReductionSumVector(const int8_t* input_vector, + int32_t* output_vector, int output_size, + int reduction_size) { + const int8_t* input_vector_ptr = input_vector; + for (int o = 0; o < output_size; o++) { + for (int r = 0; r < reduction_size; r++) { + output_vector[o] += *input_vector_ptr++; + } + } +} + void PortableMeanStddevNormalization(const float* input_vector, float* output_vector, int v_size, int n_batch) { diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h index c3c9be5e70b..f5ae5ee173f 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h @@ -76,6 +76,19 @@ void MatrixBatchVectorMultiplyAccumulate( result_stride); } +void MatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, + const int8_t* __restrict__ vectors, const float* scaling_factors, + int n_batch, float* __restrict__ result, int result_stride, + const float* per_channel_scale, const int32_t* input_offset, + int32_t* scratch, int32_t* row_sums, bool* compute_row_sums, + CpuBackendContext* context) { + PortableMatrixBatchVectorMultiplyAccumulate( + matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, + result_stride, per_channel_scale, input_offset, scratch, row_sums, + compute_row_sums, context); +} + void MatrixBatchVectorMultiplyAccumulate( const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, const int8_t* __restrict__ vector, const float* scaling_factors, @@ -241,6 +254,12 @@ void 
ReductionSumVector(const int32_t* input_vector, int32_t* output_vector, reduction_size); } +void ReductionSumVector(const int8_t* input_vector, int32_t* output_vector, + int output_size, int reduction_size) { + PortableReductionSumVector(input_vector, output_vector, output_size, + reduction_size); +} + void MeanStddevNormalization(const float* input_vector, float* output_vector, int v_size, int n_batch) { PortableMeanStddevNormalization(input_vector, output_vector, v_size, n_batch); diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h index 20e14bf6386..fb86aef1a19 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h @@ -67,6 +67,14 @@ void PortableMatrixBatchVectorMultiplyAccumulate( const int8_t* __restrict__ vectors, const float* scaling_factors, int n_batch, float* __restrict__ result, int result_stride); +void PortableMatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, + const int8_t* __restrict__ vectors, const float* scaling_factors, + int n_batch, float* __restrict__ result, int result_stride, + const float* per_channel_scale, const int32_t* input_offset, + int32_t* scratch, int32_t* row_sums, bool* compute_row_sums, + CpuBackendContext* context); + void PortableMatrixBatchVectorMultiplyAccumulate( const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, const int8_t* __restrict__ vector, const float* scaling_factors, @@ -180,6 +188,10 @@ void PortableReductionSumVector(const int32_t* input_vector, int32_t* output_vector, int output_size, int reduction_size); +void PortableReductionSumVector(const int8_t* input_vector, + int32_t* output_vector, int output_size, + int reduction_size); + // Layer norm for each batch. void PortableMeanStddevNormalization(const float* input_vector, float* output_vector, int v_size, diff --git a/tensorflow/lite/kernels/internal/tensor_utils.h b/tensorflow/lite/kernels/internal/tensor_utils.h index b86789a3ca8..a939da1448e 100644 --- a/tensorflow/lite/kernels/internal/tensor_utils.h +++ b/tensorflow/lite/kernels/internal/tensor_utils.h @@ -121,6 +121,15 @@ void MatrixBatchVectorMultiplyAccumulate( const float* __restrict__ per_channel_scale, const int32_t* __restrict__ input_offset); +// Same as the function above except that can make use of cached row sums. +void MatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, + const int8_t* __restrict__ vectors, const float* scaling_factors, + int n_batch, float* __restrict__ result, int result_stride, + const float* per_channel_scale, const int32_t* input_offset, + int32_t* scratch, int32_t* row_sums, bool* compute_row_sums, + CpuBackendContext* context); + // Same as the function above, but the matrix is stored in block compressed // sparse row format with block pattern 1x16 which consists of two arrays: // 1. A matrix array stores non-zero blocks of the matrix in row major. @@ -537,6 +546,10 @@ void ReductionSumVector(const float* input_vector, float* output_vector, void ReductionSumVector(const int32_t* input_vector, int32_t* output_vector, int output_size, int reduction_size); +// Same as above but input is 8 bit integer. +void ReductionSumVector(const int8_t* input_vector, int32_t* output_vector, + int output_size, int reduction_size); + // Layer norm for each batch. 
void MeanStddevNormalization(const float* input_vector, float* output_vector, int v_size, int n_batch); diff --git a/tensorflow/lite/kernels/internal/tensor_utils_test.cc b/tensorflow/lite/kernels/internal/tensor_utils_test.cc index 4dd4004c981..5eaa0a9aebf 100644 --- a/tensorflow/lite/kernels/internal/tensor_utils_test.cc +++ b/tensorflow/lite/kernels/internal/tensor_utils_test.cc @@ -272,7 +272,7 @@ TEST(uKernels, AsymmetricQuantizeFloatsAllZerosTest) { int32_t test_offset; AsymmetricQuantizeFloats(input, kVectorSize, output, &test_scale, &test_offset); - EXPECT_EQ(test_scale, 0); + EXPECT_EQ(test_scale, 1); EXPECT_EQ(test_offset, 0); EXPECT_THAT(output, testing::ElementsAreArray({0, 0, 0, 0, 0, 0, 0, 0, 0})); } @@ -396,6 +396,78 @@ TEST(uKernels, QuantMatrixBatchVectorMultiplyAccumulate8x8_16Test) { EXPECT_THAT(output, testing::ElementsAreArray(expected_output)); } +TEST(uKernels, HybridMatrixBatchVectorMultiplyAccumulate8x8_16Test) { + CpuBackendContext context; + const std::vector input = { + 4, -41, 5, -41, 22, 17, -30, 24, 13, -47, 18, 9, -11, -30, 16, + 1, -47, 12, 36, -20, 27, -3, 0, -51, -31, 3, -8, -38, 43, 23, + 12, 1, 11, -23, -26, 23, 14, -9, -44, 22, 21, -30, 3, -47, -26, + -21, -24, 1, -44, 34, -11, -23, -28, 26, -38, 19, 35, 9, 23, 6, + -42, -25, 28, 1, 4, -41, 5, -41, 22, 17, -30, 24, 13, -47, 18, + 9, -11, -30, 16, 1, -47, 12, 36, -20, 27, -3, 0, -51, -31, 3, + -8, -38, 43, 23, 12, 1, 11, -23, -26, 23, 14, -9, -44, 22, 21, + -30, 3, -47, -26, -21, -24, 1, -44, 34, -11, -23, -28, 26, -38, 19, + 35, 9, 23, 6, -42, -25, 28, 1, + }; + const std::vector input_offsets = {1, 1, 1, 1}; + + const std::vector scaling_factors = { + 1.0, + 1.0, + 1.0, + 1.0, + }; + + const std::vector input_to_gate_weights = { + -10, -4, -8, 16, 4, -16, -1, 11, 1, 2, -25, 19, 7, 9, 2, + 1, -24, -2, 10, -7, 7, -5, -2, 3, 4, 3, -4, -7, -11, -13, + -18, 2, 11, 10, 12, -9, 17, -15, -5, 20, -6, -11, 2, -6, -18, + 15, 4, 3, 4, -9, -2, -3, -9, -13, 17, -21, 5, 3, -12, 0, + -4, 9, -5, 4, 10, -2, 8, 1, -10, -6, 1, -9, 10, 11, -1, + -5, 4, -7, -4, 5, -4, 4, 12, -7, -5, -9, -19, 6, -4, 12, + -17, -22, 0, 9, -4, 6, -5, 5, -8, 8, 3, 15, -18, -18, 5, + 3, -12, 5, -10, 7, 7, 7, -9, 17, 2, -11, -25, 3, 19, -6, + 7, 1, 7, 5, -3, 11, 3, 8, 0, -8, 8, -2, -2, -12, 14, + -5, 7, 8, 16, 20, -16, -5, -5, 9, 1, -10, -6, 14, 10, -12, + 10, -6, 5, 0, 3, 8, -9, -13, -2, 10, 4, 4, -16, -17, -9, + 16, -5, 14, -9, -5, -12, 0, 17, 6, -1, 11, 16, -20, 1, -11, + -1, -10, -21, 13, 4, -12, -7, 0, -14, -6, 3, 12, -4, 6, -18, + -3, -1, 14, -8, -6, -15, 5, 12, -3, -10, 4, 6, 13, -5, -20, + 0, 3, -3, -7, 1, 2, -10, 7, -3, 6, 1, -12, 6, 14, -5, + -20, 0, 3, -3, -7, 1, 2, -10, 7, -3, 6, 1, -12, 6, 15, + -5, -20, 0, 3, -3, -7, 1, 2, -10, 7, -3, 6, 1, -12, 6, + 16, + }; + + std::vector scratch(5 * 8, 0); + std::vector output(4 * 8, 0); + int32_t* row_sums = scratch.data() + 8 * 4; + bool compute_row_sums = true; + MatrixBatchVectorMultiplyAccumulate( + input_to_gate_weights.data(), /*m_rows=*/8, /*m_cols=*/32, input.data(), + scaling_factors.data(), /*n_batch*/ 4, output.data(), 1, nullptr, + input_offsets.data(), scratch.data(), row_sums, &compute_row_sums, + &context); + + const std::vector expected_output = { + -228, 1548, 937, -166, -1164, -1578, -278, 303, 839, -820, 132, + 1733, -1858, 58, -425, -587, -228, 1548, 937, -166, -1164, -1578, + -278, 303, 839, -820, 132, 1733, -1858, 58, -425, -587, + }; + + EXPECT_THAT(output, testing::ElementsAreArray(expected_output)); + EXPECT_THAT(compute_row_sums, false); + + std::vector output2(4 * 8, 
0); + MatrixBatchVectorMultiplyAccumulate( + input_to_gate_weights.data(), /*m_rows=*/8, /*m_cols=*/32, input.data(), + scaling_factors.data(), /*n_batch*/ 4, output2.data(), 1, nullptr, + input_offsets.data(), scratch.data(), row_sums, &compute_row_sums, + &context); + + EXPECT_THAT(output2, testing::ElementsAreArray(expected_output)); +} + // Qautnized matmul with 2 * 30 input and 9 * 30 matrix. TEST(uKernels, QuantMatrixBatchVectorMultiplyAccumulate8x8_8Test) { CpuBackendContext context; From 7cb3d2541a658b0004d6fe6fa00d23876bbc0c7e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 02:46:50 -0800 Subject: [PATCH 226/442] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 295928284 Change-Id: I19b48cf57cf228db5da5cd393e153705af784fb3 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index c744d5b466a..f69affe5e8a 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. 
-// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21345,7 +21345,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22053,7 +22053,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22249,7 +22249,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22318,7 +22318,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22433,7 +22433,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22492,7 +22492,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22666,7 +22666,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22857,7 +22857,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25297,7 +25297,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25629,7 +25629,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25679,7 +25679,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25929,7 +25929,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26559,7 +26559,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27624,7 +27624,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45491,7 +45491,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From e07cc39fefdfb99058d1719e1c92c2d3a7ec4a45 Mon Sep 17 00:00:00 2001 From: Tom Forbes Date: Wed, 19 Feb 2020 11:37:45 +0000 Subject: [PATCH 227/442] Fix typo in tf.data.Dataset.list_files example code This should be `Dataset`, not `dataset`. --- tensorflow/python/data/ops/dataset_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py index 799bfdfd490..6e48332dfdf 100644 --- a/tensorflow/python/data/ops/dataset_ops.py +++ b/tensorflow/python/data/ops/dataset_ops.py @@ -131,7 +131,7 @@ class DatasetV2(tracking_base.Trackable, composite_tensor.CompositeTensor): To create a dataset of all files matching a pattern, use `tf.data.Dataset.list_files`: - >>> dataset = tf.data.dataset.list_files("/path/*.txt") # doctest: +SKIP + >>> dataset = tf.data.Dataset.list_files("/path/*.txt") # doctest: +SKIP See `tf.data.FixedLengthRecordDataset` and `tf.data.Dataset.from_generator` for more ways to create datasets. From 5dab22191d70d2dcd247d2d7b11628981c0a6f12 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 04:46:05 -0800 Subject: [PATCH 228/442] Go: Update generated wrapper functions for TensorFlow ops. 
PiperOrigin-RevId: 295942242 Change-Id: I6aae645e9774b5e4e65e3286384991242f76b57e --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index f69affe5e8a..c744d5b466a 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. 
The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21345,7 +21345,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22053,7 +22053,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22249,7 +22249,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22318,7 +22318,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22433,7 +22433,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22492,7 +22492,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22666,7 +22666,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22857,7 +22857,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25297,7 +25297,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25629,7 +25629,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25679,7 +25679,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25929,7 +25929,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26559,7 +26559,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27624,7 +27624,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45491,7 +45491,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 768717c917355536c70344a6c53961234826d2d2 Mon Sep 17 00:00:00 2001 From: Alexander Belyaev Date: Wed, 19 Feb 2020 06:34:07 -0800 Subject: [PATCH 229/442] [MLIR][XLA] Expose parameters of LhloFuseLinalg pass using llvm flags. Adds flags: "tile-to-parallel-loops-for-linalg-fusion": "Tiles GenericOp consumer to parallel loops before linalg fusion" "tile-sizes-for-linalg-fusion": "Tile sizes by which to tile linalg generic before linalg fusion"), PiperOrigin-RevId: 295955774 Change-Id: Ia0aa12821d19b1710668d3336dc1278e02411ee5 --- .../mlir/xla/tests/lhlo-fuse-linalg.mlir | 97 ++++++++++++++----- .../mlir/xla/transforms/lhlo_fuse_linalg.cc | 37 ++++++- 2 files changed, 106 insertions(+), 28 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir index 7f9e8c19780..a9ffc116392 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir @@ -1,32 +1,57 @@ -// RUN: tf-opt -lhlo-fuse-linalg %s -o - | FileCheck %s +// RUN: tf-opt -lhlo-fuse-linalg %s -o - | FileCheck %s --dump-input=always +// RUN: tf-opt -lhlo-fuse-linalg -tile-sizes-for-linalg-fusion=2,3 %s -o - | FileCheck %s -check-prefix=TILED --dump-input-on-failure +// RUN: tf-opt -lhlo-fuse-linalg -tile-to-parallel-loops-for-linalg-fusion %s -o - | FileCheck %s -check-prefix=PLOOP --dump-input-on-failure + #map0 = affine_map<(d0, d1) -> (d0, d1)> #pointwise_2d_trait = {args_in = 2, args_out = 1, indexing_maps = [#map0, #map0, #map0], iterator_types = ["parallel", "parallel"]} -func @fusion(%multiplier: memref<2x2xf32>, %summand_1: memref<2x2xf32>, - %summand_2: memref<2x2xf32>, %result: memref<2x2xf32>) { - %temp_result = alloc() {temp = true} : memref<2x2xf32> +func @fusion(%multiplier: memref<6x6xf32>, %summand_1: memref<6x6xf32>, + %summand_2: memref<6x6xf32>, %result: memref<6x6xf32>) { + %temp_result = alloc() {temp = true} : memref<6x6xf32> linalg.generic #pointwise_2d_trait %summand_1, %summand_2, %temp_result { ^bb0(%summand_1_in: f32, %summand_2_in: f32, %temp_result_in: f32): %out = addf %summand_1_in, %summand_2_in : f32 linalg.yield %out : f32 - } : memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32> + } : memref<6x6xf32>, memref<6x6xf32>, memref<6x6xf32> linalg.generic 
#pointwise_2d_trait %temp_result, %multiplier, %result { ^bb0(%temp_result_in: f32, %multiplier_in: f32, %result_in: f32): %out = mulf %temp_result_in, %multiplier_in : f32 linalg.yield %out : f32 - } : memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32> - dealloc %temp_result : memref<2x2xf32> + } : memref<6x6xf32>, memref<6x6xf32>, memref<6x6xf32> + dealloc %temp_result : memref<6x6xf32> "xla_lhlo.terminator"() : () -> () } // CHECK-LABEL: func @fusion -// CHECK-NOT: linalg.generic -// CHECK: loop.for -// CHECK: loop.for -// CHECK-NOT: loop.for -// CHECK: linalg.generic -// CHECK: addf -// CHECK: linalg.generic -// CHECK: mulf +// CHECK: %[[C1:.*]] = constant 1 +// CHECK-NOT: linalg.generic +// CHECK: loop.for {{.*}} step %[[C1]] +// CHECK: loop.for {{.*}} step %[[C1]] +// CHECK-NOT: loop.for +// CHECK: linalg.generic +// CHECK: addf +// CHECK: linalg.generic +// CHECK: mulf + +// TILED-LABEL: func @fusion +// TILED-DAG: %[[C2:.*]] = constant 2 +// TILED-DAG: %[[C3:.*]] = constant 3 +// TILED-NOT: linalg.generic +// TILED: loop.for {{.*}} step %[[C2]] +// TILED: loop.for {{.*}} step %[[C3]] +// TILED-NOT: loop.for +// TILED: linalg.generic +// TILED: addf +// TILED: linalg.generic +// TILED: mulf + +// PLOOP-LABEL: func @fusion +// PLOOP-NOT: linalg.generic +// PLOOP: loop.parallel +// PLOOP-NOT: loop.parallel +// PLOOP: linalg.generic +// PLOOP: addf +// PLOOP: linalg.generic +// PLOOP: mulf func @fusion_of_three(%arg0: memref<100x10xf32>, %arg1: memref<100xf32>, @@ -67,12 +92,36 @@ func @fusion_of_three(%arg0: memref<100x10xf32>, return } // CHECK-LABEL: func @fusion -// CHECK-NOT: linalg.generic -// CHECK: loop.for -// CHECK: loop.for -// CHECK-NOT: loop.for -// CHECK: linalg.generic -// CHECK: linalg.generic -// CHECK: subf -// CHECK: linalg.generic -// CHECK: exp +// CHECK: %[[C1:.*]] = constant 1 +// CHECK-NOT: linalg.generic +// CHECK: loop.for {{.*}} step %[[C1]] +// CHECK: loop.for {{.*}} step %[[C1]] +// CHECK-NOT: loop.for +// CHECK: linalg.generic +// CHECK: linalg.generic +// CHECK: subf +// CHECK: linalg.generic +// CHECK: exp + +// TILED-LABEL: func @fusion_of_three +// TILED-DAG: %[[C2:.*]] = constant 2 +// TILED-DAG: %[[C3:.*]] = constant 3 +// TILED-NOT: linalg.generic +// TILED: loop.for {{.*}} step %[[C2]] +// TILED: loop.for {{.*}} step %[[C3]] +// TILED-NOT: loop.for +// TILED: linalg.generic +// TILED: linalg.generic +// TILED: subf +// TILED: linalg.generic +// TILED: exp + +// PLOOP-LABEL: func @fusion_of_three +// PLOOP-NOT: linalg.generic +// PLOOP: loop.parallel +// PLOOP-NOT: loop.parallel +// PLOOP: linalg.generic +// PLOOP: linalg.generic +// PLOOP: subf +// PLOOP: linalg.generic +// PLOOP: exp diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc index b5e33fb0663..6b2b548550a 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc @@ -22,6 +22,20 @@ limitations under the License. 
#include "mlir/Pass/Pass.h" // TF:llvm-project #include "mlir/Transforms/FoldUtils.h" // TF:llvm-project +// NOLINTNEXTLINE +static llvm::cl::opt tile_to_parallel_loops_for_linalg_fusion( + "tile-to-parallel-loops-for-linalg-fusion", + llvm::cl::desc( + "Tiles GenericOp consumer to parallel loops before linalg fusion"), + llvm::cl::init(false)); + +// NOLINTNEXTLINE +static llvm::cl::list tile_sizes_for_linalg_fusion( + "tile-sizes-for-linalg-fusion", + llvm::cl::desc( + "Tile sizes by which to tile linalg generic before linalg fusion"), + llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated); + namespace mlir { namespace xla_lhlo { namespace { @@ -50,13 +64,16 @@ struct LhloFuseLinalg : public FunctionPass { OpBuilder b(func); OperationFolder folder(func.getContext()); func.walk([&](linalg::GenericOp generic_op) { - const SmallVector tile_sizes( - generic_op.getNumInputsAndOutputs(), 1); + SmallVector tile_sizes(tile_sizes_for_linalg_fusion.begin(), + tile_sizes_for_linalg_fusion.end()); + if (tile_sizes.empty()) { + tile_sizes = + SmallVector(generic_op.getNumInputsAndOutputs(), 1); + } auto op = cast(generic_op.getOperation()); for (const Value result : op.getOutputBuffers()) { if (!func_args.count(result)) continue; - if (linalg::tileLinalgOp(b, op, tile_sizes, /*permutation=*/{}, - &folder)) { + if (tileGenericOp(op, tile_sizes, &b, &folder)) { generic_op.erase(); return; } @@ -83,6 +100,18 @@ struct LhloFuseLinalg : public FunctionPass { } for (auto* e : erase_set) e->erase(); } + + private: + bool tileGenericOp(LinalgOp op, ArrayRef tile_sizes, OpBuilder* b, + OperationFolder* folder) { + auto tiled_generic_op = + tile_to_parallel_loops_for_linalg_fusion + ? linalg::tileLinalgOpToParallelLoops(*b, op, tile_sizes, + /*permutation=*/{}, folder) + : linalg::tileLinalgOp(*b, op, tile_sizes, + /*permutation=*/{}, folder); + return tiled_generic_op.hasValue(); + } }; } // namespace From 28046a55b72fd5b49879259414daa998015b34c8 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 06:46:42 -0800 Subject: [PATCH 230/442] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 295957366 Change-Id: I21c518e84fc4815c87709ca7a80a925ad2e2a7bf --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index c744d5b466a..f69affe5e8a 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. 
-// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21345,7 +21345,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22053,7 +22053,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22249,7 +22249,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22318,7 +22318,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22433,7 +22433,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22492,7 +22492,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22666,7 +22666,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22857,7 +22857,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25297,7 +25297,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25629,7 +25629,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25679,7 +25679,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25929,7 +25929,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26559,7 +26559,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27624,7 +27624,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45491,7 +45491,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 911d4a618ababfa073b87c49a2ab05418b565d4b Mon Sep 17 00:00:00 2001 From: Stefano Galarraga Date: Wed, 19 Feb 2020 07:21:11 -0800 Subject: [PATCH 231/442] Expose option to limit the number of partitions that will be delegated to NNAPI PiperOrigin-RevId: 295962456 Change-Id: I43e13700e23b798ce786b7f1034066961c4c3613 --- .../tensorflow/lite/nnapi/NnApiDelegate.java | 43 ++-- .../src/main/native/nnapi_delegate_jni.cc | 6 +- .../lite/delegates/nnapi/nnapi_delegate.cc | 166 ++++++++++---- .../lite/delegates/nnapi/nnapi_delegate.h | 43 +++- .../nnapi_delegate_device_selection_test.cc | 211 +++++++++++++++++- 5 files changed, 409 insertions(+), 60 deletions(-) diff --git a/tensorflow/lite/delegates/nnapi/java/src/main/java/org/tensorflow/lite/nnapi/NnApiDelegate.java b/tensorflow/lite/delegates/nnapi/java/src/main/java/org/tensorflow/lite/nnapi/NnApiDelegate.java index 91299d7707f..989cb2c1480 100644 --- a/tensorflow/lite/delegates/nnapi/java/src/main/java/org/tensorflow/lite/nnapi/NnApiDelegate.java +++ b/tensorflow/lite/delegates/nnapi/java/src/main/java/org/tensorflow/lite/nnapi/NnApiDelegate.java @@ -65,24 +65,35 @@ public class NnApiDelegate implements Delegate, AutoCloseable { } public Options setAcceleratorName(String name) { - this.accelerator_name = name; + this.acceleratorName = name; return this; } - public Options setCacheDir(String name) { - this.cache_dir = name; + public Options setCacheDir(String cacheDir) { + this.cacheDir = cacheDir; return this; } - public Options setModelToken(String name) { - this.model_token = name; + public Options setModelToken(String modelToken) { + this.modelToken = modelToken; return this; } - int executionPreference = EXECUTION_PREFERENCE_UNDEFINED; - String accelerator_name = null; - String cache_dir = null; - String model_token = null; + /** + * Sets the maximum number of graph partitions that the delegate will try to delegate. If more + * partitions could be delegated than the limit, the ones with the larger number of nodes will + * be chosen. If unset it will use the NNAPI default limit. + */ + public Options setMaxNumberOfDelegatedPartitions(int limit) { + this.maxDelegatedPartitions = limit; + return this; + } + + private int executionPreference = EXECUTION_PREFERENCE_UNDEFINED; + private String acceleratorName = null; + private String cacheDir = null; + private String modelToken = null; + private Integer maxDelegatedPartitions = null; } public NnApiDelegate(Options options) { @@ -91,9 +102,10 @@ public class NnApiDelegate implements Delegate, AutoCloseable { delegateHandle = createDelegate( options.executionPreference, - options.accelerator_name, - options.cache_dir, - options.model_token); + options.acceleratorName, + options.cacheDir, + options.modelToken, + options.maxDelegatedPartitions != null ? 
options.maxDelegatedPartitions : -1); } public NnApiDelegate() { @@ -118,8 +130,13 @@ public class NnApiDelegate implements Delegate, AutoCloseable { } } + // private static native long createDelegate( - int preference, String device_name, String cache_dir, String model_token); + int preference, + String deviceName, + String cacheDir, + String modelToken, + int maxDelegatedPartitions); private static native void deleteDelegate(long delegateHandle); } diff --git a/tensorflow/lite/delegates/nnapi/java/src/main/native/nnapi_delegate_jni.cc b/tensorflow/lite/delegates/nnapi/java/src/main/native/nnapi_delegate_jni.cc index 65d39b0a1de..d256faedd11 100644 --- a/tensorflow/lite/delegates/nnapi/java/src/main/native/nnapi_delegate_jni.cc +++ b/tensorflow/lite/delegates/nnapi/java/src/main/native/nnapi_delegate_jni.cc @@ -26,7 +26,7 @@ using namespace tflite; JNIEXPORT jlong JNICALL Java_org_tensorflow_lite_nnapi_NnApiDelegate_createDelegate( JNIEnv* env, jclass clazz, jint preference, jstring accelerator_name, - jstring cache_dir, jstring model_token) { + jstring cache_dir, jstring model_token, jint max_delegated_partitions) { StatefulNnApiDelegate::Options options = StatefulNnApiDelegate::Options(); options.execution_preference = (StatefulNnApiDelegate::Options::ExecutionPreference)preference; @@ -40,6 +40,10 @@ Java_org_tensorflow_lite_nnapi_NnApiDelegate_createDelegate( options.model_token = env->GetStringUTFChars(model_token, NULL); } + if (max_delegated_partitions >= 0) { + options.max_number_delegated_partitions = max_delegated_partitions; + } + auto delegate = new StatefulNnApiDelegate(options); if (options.accelerator_name) { diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc index 0e074c8b70e..a3a4babd91f 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc @@ -22,6 +22,7 @@ limitations under the License. #include #include #include +#include #include #include #include @@ -3850,6 +3851,8 @@ StatefulNnApiDelegate::StatefulNnApiDelegate(const NnApi* nnapi, delegate_data_.model_token = options.model_token; } delegate_data_.disallow_nnapi_cpu = options.disallow_nnapi_cpu; + delegate_data_.max_number_delegated_partitions = + options.max_number_delegated_partitions; TFLITE_LOG_PROD_ONCE(tflite::TFLITE_LOG_INFO, "Created TensorFlow Lite delegate for NNAPI."); Prepare = DoPrepare; @@ -3877,6 +3880,8 @@ const StatefulNnApiDelegate::Options StatefulNnApiDelegate::GetOptions( ? 
nullptr : delegate_data->model_token.c_str(); options.disallow_nnapi_cpu = delegate_data->disallow_nnapi_cpu; + options.max_number_delegated_partitions = + delegate_data->max_number_delegated_partitions; return options; } @@ -3943,6 +3948,110 @@ int StatefulNnApiDelegate::GetNnApiErrno() const { using ::tflite::delegate::nnapi::kMinSdkVersionForNNAPI; using ::tflite::delegate::nnapi::kMinSdkVersionForNNAPI12; +namespace { + +std::unique_ptr BuildTfLiteIntArray( + const std::vector& data) { + std::unique_ptr result( + TfLiteIntArrayCreate(data.size())); + std::copy(data.begin(), data.end(), result->data); + return result; +} +} // namespace + +// static +TfLiteStatus StatefulNnApiDelegate::GetNodesSupportedByAccelerator( + TfLiteContext* context, TfLiteDelegate* delegate, const NnApi* nnapi, + const std::vector& supported_nodes, + std::vector* device_supported_nodes, int* num_partitions, + TfLiteDelegateParams** params_array, int* nnapi_errno) { + auto* delegate_data = static_cast(delegate->data_); + // The first entry in the array is the element count + + auto supported_nodes_int_array = BuildTfLiteIntArray(supported_nodes); + TF_LITE_ENSURE_STATUS(context->PreviewDelegatePartitioning( + context, supported_nodes_int_array.get(), params_array, num_partitions)); + // For each partition check if which nodes are actually supported by the + // target accelerators. + delegate_data->delegate_state_cache.clear(); + for (int idx = 0; idx < *num_partitions; idx++) { + const auto& partition_params = (*params_array)[idx]; + auto kernel_state = absl::make_unique(nnapi); + TfLiteDelegateParams params_with_delegate = partition_params; + params_with_delegate.delegate = delegate; + TF_LITE_ENSURE_STATUS( + kernel_state->Init(context, ¶ms_with_delegate, nnapi_errno)); + std::vector supported_partition_nodes; + TF_LITE_ENSURE_STATUS( + kernel_state->GetOperationsSupportedByTargetNnApiDevices( + context, &supported_partition_nodes, nnapi_errno)); + device_supported_nodes->insert(device_supported_nodes->end(), + supported_partition_nodes.begin(), + supported_partition_nodes.end()); + + bool model_fully_supported = (supported_partition_nodes.size() == + partition_params.nodes_to_replace->size); + if (model_fully_supported) { + delegate_data->CacheDelegateKernel(&partition_params, + kernel_state.release()); + } + } + + if (device_supported_nodes->size() != supported_nodes.size()) { + // We changed the set of nodes to delegate this will create a different + // partitioning layout. 
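+    // Illustrative sketch only (node indexes are hypothetical, not taken
+    // from a real model): if nodes {0, 1, 2, 3} formed a single partition
+    // when all of them were assumed supported, but the target accelerator
+    // rejects node 2, re-partitioning over {0, 1, 3} can instead produce
+    // two partitions ({0, 1} and {3}). The params_array computed above is
+    // therefore stale and has to be recomputed below.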
+ auto device_sup_nodes_int_array = + BuildTfLiteIntArray(*device_supported_nodes); + TF_LITE_ENSURE_STATUS(context->PreviewDelegatePartitioning( + context, device_sup_nodes_int_array.get(), params_array, + num_partitions)); + } + + return kTfLiteOk; +} + +// static +TfLiteStatus StatefulNnApiDelegate::LimitDelegatedPartitions( + int max_partitions, + std::vector partition_params_array, + std::vector* nodes_to_delegate) { + int num_partitions = partition_params_array.size(); + if (max_partitions <= 0 || num_partitions <= max_partitions) { + return kTfLiteOk; + } + + int number_delegated_partitions = std::count_if( + partition_params_array.begin(), partition_params_array.end(), + [nodes_to_delegate](const TfLiteDelegateParams& partition_params) { + return std::find(nodes_to_delegate->begin(), nodes_to_delegate->end(), + partition_params.nodes_to_replace->data[0]) != + nodes_to_delegate->end(); + }); + + if (number_delegated_partitions > max_partitions) { + std::sort(partition_params_array.begin(), partition_params_array.end(), + [](const TfLiteDelegateParams& left, + const TfLiteDelegateParams& right) -> bool { + // Reverse sort + return left.nodes_to_replace->size > + right.nodes_to_replace->size; + }); + + nodes_to_delegate->clear(); + + for (int i = 0; i < max_partitions; i++) { + const TfLiteDelegateParams& partition_params = partition_params_array[i]; + + nodes_to_delegate->insert(nodes_to_delegate->end(), + partition_params.nodes_to_replace->data, + partition_params.nodes_to_replace->data + + partition_params.nodes_to_replace->size); + } + } + + return kTfLiteOk; +} + TfLiteStatus StatefulNnApiDelegate::DoPrepare(TfLiteContext* context, TfLiteDelegate* delegate) { auto* delegate_data = static_cast(delegate->data_); @@ -3998,10 +4107,8 @@ TfLiteStatus StatefulNnApiDelegate::DoPrepare(TfLiteContext* context, } } } - // Allocate one element in vector already since TensorFlow Lite uses - // the first value as the number of nodes. The actual value will be set - // later, after the vector has been filled. - std::vector supported_nodes(1); + + std::vector supported_nodes; // We don't care about all nodes_, we only care about ones in the // current plan. TfLiteIntArray* plan; @@ -4021,11 +4128,9 @@ TfLiteStatus StatefulNnApiDelegate::DoPrepare(TfLiteContext* context, supported_nodes.push_back(node_index); } } - // First element in vector must be the number of actual nodes. - supported_nodes[0] = supported_nodes.size() - 1; // If there are no delegated nodes, short-circuit node replacement. - if (!supported_nodes[0]) { + if (supported_nodes.empty()) { return kTfLiteOk; } @@ -4082,40 +4187,20 @@ TfLiteStatus StatefulNnApiDelegate::DoPrepare(TfLiteContext* context, std::vector& nodes_to_delegate = supported_nodes; if (is_accelerator_specified) { + std::vector device_supported_nodes; + int num_partitions; TfLiteDelegateParams* params_array; - int num_partitions = 0; - // The first entry in the array is the element count - std::vector device_supported_nodes(1); - TF_LITE_ENSURE_STATUS(context->PreviewDelegatePartitioning( - context, reinterpret_cast(supported_nodes.data()), - ¶ms_array, &num_partitions)); - // For each partition check if which nodes are actually supported by the - // target accelerators. 
- delegate_data->delegate_state_cache.clear(); - for (int idx = 0; idx < num_partitions; idx++) { - const auto& partition_params = params_array[idx]; - auto kernel_state = absl::make_unique(nnapi); - TfLiteDelegateParams params_with_delegate = partition_params; - params_with_delegate.delegate = delegate; - TF_LITE_ENSURE_STATUS( - kernel_state->Init(context, ¶ms_with_delegate, nnapi_errno)); - std::vector supported_partition_nodes; - TF_LITE_ENSURE_STATUS( - kernel_state->GetOperationsSupportedByTargetNnApiDevices( - context, &supported_partition_nodes, nnapi_errno)); - device_supported_nodes.insert(device_supported_nodes.end(), - supported_partition_nodes.begin(), - supported_partition_nodes.end()); - bool model_fully_supported = (supported_partition_nodes.size() == - partition_params.nodes_to_replace->size); - if (model_fully_supported) { - delegate_data->CacheDelegateKernel(&partition_params, - kernel_state.release()); - } - } + TF_LITE_ENSURE_STATUS(GetNodesSupportedByAccelerator( + context, delegate, nnapi, supported_nodes, &device_supported_nodes, + &num_partitions, ¶ms_array, nnapi_errno)); + + TF_LITE_ENSURE_STATUS(LimitDelegatedPartitions( + delegate_options.max_number_delegated_partitions, + std::vector(params_array, + params_array + num_partitions), + &device_supported_nodes)); - device_supported_nodes[0] = device_supported_nodes.size() - 1; nodes_to_delegate = device_supported_nodes; } @@ -4124,9 +4209,10 @@ TfLiteStatus StatefulNnApiDelegate::DoPrepare(TfLiteContext* context, } else { // Request TFLite to partition the graph and make kernels // for each independent node sub set a new nnapi_delegate_kernel. + auto nodes_to_delegate_int_array = BuildTfLiteIntArray(nodes_to_delegate); return context->ReplaceNodeSubsetsWithDelegateKernels( - context, nnapi_delegate_kernel, - reinterpret_cast(nodes_to_delegate.data()), delegate); + context, nnapi_delegate_kernel, nodes_to_delegate_int_array.get(), + delegate); } } diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate.h index e0657c6e13b..423490438a9 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.h +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.h @@ -80,6 +80,15 @@ class StatefulNnApiDelegate : public TfLiteDelegate { // kernels, but allowing CPU allows partial acceleration of models. If this // is set to true, NNAPI is only used if the whole model is accelerated. bool disallow_nnapi_cpu = false; + + // Specifies the max number of partitions to delegate. A value <= 0 means + // no limit. + // If the delegation of the full set of supported nodes would generate a + // number of partition greater than this parameter, only + // of them will be actually accelerated. + // The selection is currently done sorting partitions in decreasing order + // of number of nodes and selecting them until the limit is reached. + int max_number_delegated_partitions = 0; }; // Uses default options. @@ -172,13 +181,17 @@ class StatefulNnApiDelegate : public TfLiteDelegate { // The key is the index of the first node in the partition. // Couldn't use unique_ptr because of problems building on gcc std::unordered_map delegate_state_cache; + // Maximum number of NNAPI partition to delegate. Zero or negative means + // no limit. Copied from StatefulNnApiDelegate::Options + int max_number_delegated_partitions; ~Data(); // Caches an initialised NNAPIDelegateKernel. 
void CacheDelegateKernel(const TfLiteDelegateParams* delegate_params, NNAPIDelegateKernel* delegate_state); - // Returns a cached NNAPIDelegateKernel if available. + // Returns a cached NNAPIDelegateKernel if available and removes it + // from the cache transferring the ownership to the caller. absl::optional GetCachedDelegateKernel( const TfLiteDelegateParams* delegate_params); }; @@ -211,6 +224,34 @@ class StatefulNnApiDelegate : public TfLiteDelegate { TfLiteDelegate* delegate, TfLiteBufferHandle* handle); + // Returns the nodes that can be delegated via NNAPI to the accelerator + // specified in the delegate options and information about the way the + // graph will be partitioned if the supported nodes will be delegated. + // Partition information is composed by the number of partitions and + // the delegate parameters associated to each partition. + // The method also caches in delegate->data the NNApiDelegateKernel instances + // that have been created during the device evaluation. + // All arguments are expected to be non-null. + static TfLiteStatus GetNodesSupportedByAccelerator( + TfLiteContext* context, TfLiteDelegate* delegate, const NnApi* nnapi, + const std::vector& supported_nodes, + std::vector* device_supported_nodes, int* num_partitions, + TfLiteDelegateParams** params_array, int* nnapi_errno); + + // Alters the given array of nodes_to_delegate to limit the number of NNAPI + // owned partition to be less or equal than num_partitions. If num_partitions + // is less or equal to zero the input is left unaltered. + // The nodes_to_delegate array is expected to contain at element 0 the number + // of nodes to delegate and in remaining elements the set of nodes + // that would be delegated to NNAPI if this function wouldn't be + // called. It will be altered storing in the first element the count of + // nodes to actually delegate and in the remainder of the array the indexes. + // The params_array params might be altered during the functions execution. + static TfLiteStatus LimitDelegatedPartitions( + int max_partitions, + std::vector partition_params_array, + std::vector* nodes_to_delegate); + // Delegate data presented through TfLiteDelegate::data_. Data delegate_data_; }; diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_device_selection_test.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate_device_selection_test.cc index eb9cad684a1..bf9e00bee69 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_device_selection_test.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_device_selection_test.cc @@ -14,6 +14,14 @@ limitations under the License. ==============================================================================*/ #include +#include +#include +#include +#include +#include +#include +#include + #include #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h" @@ -223,18 +231,21 @@ class AcceleratedModel { protected: // build a delegate with a target accelerator name. 
- explicit AcceleratedModel(const NnApi* nnapi, - const std::string& accelerator_name) { + AcceleratedModel(const NnApi* nnapi, const std::string& accelerator_name, + int max_nnapi_partitions = 0) { StatefulNnApiDelegate::Options options; options.accelerator_name = accelerator_name.c_str(); + options.max_number_delegated_partitions = max_nnapi_partitions; stateful_delegate_.reset(new StatefulNnApiDelegate(nnapi, options)); } // build a delegate with no target accelerator name, can disable the NNAPI CPU // fallback implementation using the disallow_nnapi_cpu flag. - explicit AcceleratedModel(const NnApi* nnapi, bool disallow_nnapi_cpu) { + AcceleratedModel(const NnApi* nnapi, bool disallow_nnapi_cpu, + int max_nnapi_partitions = 0) { StatefulNnApiDelegate::Options options; options.disallow_nnapi_cpu = disallow_nnapi_cpu; + options.max_number_delegated_partitions = max_nnapi_partitions; stateful_delegate_.reset(new StatefulNnApiDelegate(nnapi, options)); } @@ -305,8 +316,6 @@ TEST_F(UnsupportedOperationOnDeviceTest, << "Expected Max not to be delegates since it not supported before NNAPI " "1.2 and device declares to support only NNAPI 1.1."; - TFLITE_LOG_PROD(TFLITE_LOG_INFO, "First part of test done"); - nnapi_mock_->SetNnapiSupportedDevice("test-device", /* feature_level=*/29); ArgMaxOpModel m1({1, 1, 1, 4}, TensorType_FLOAT32, /*axis_value=*/3, @@ -535,6 +544,198 @@ TEST_F(UnsupportedOperationOnDeviceTest, ShouldCacheModelCompilation) { EXPECT_EQ(should_cache_model_compilation_model_create_count, 1); } +// Model with a chain of no-op (add with zero operations) +class LongIdentityModel : public MultiOpModel, public AcceleratedModel { + public: + LongIdentityModel(const std::vector& input_shape, int graph_size, + const NnApi* nnapi, const std::string& accelerator_name, + int max_nnapi_partitions) + : MultiOpModel(), + AcceleratedModel(nnapi, accelerator_name, max_nnapi_partitions) { + auto* delegate = GetDelegate(); + this->SetApplyDelegate([delegate](Interpreter* interpreter) { + interpreter->ModifyGraphWithDelegate(delegate); + }); + + const TensorData tensor_data{TensorType_FLOAT32, input_shape}; + + input_ = AddInput(tensor_data); + zero_input_ = AddInput(tensor_data); + + std::vector intermediate_outputs(graph_size - 1); + std::generate( + std::begin(intermediate_outputs), std::end(intermediate_outputs), + [this, &tensor_data]() { return AddInnerTensor(tensor_data); }); + + output_ = AddOutput(tensor_data); + + AddBuiltinOp(BuiltinOperator_ADD, BuiltinOptions_AddOptions, + CreateAddOptions(builder_).Union(), {input_, zero_input_}, + {intermediate_outputs[0]}); + + for (int i = 0; i < intermediate_outputs.size() - 1; i++) { + AddBuiltinOp(BuiltinOperator_ADD, BuiltinOptions_AddOptions, + CreateAddOptions(builder_).Union(), + {intermediate_outputs[i], zero_input_}, + {intermediate_outputs[i + 1]}); + } + + AddBuiltinOp( + BuiltinOperator_ADD, BuiltinOptions_AddOptions, + CreateAddOptions(builder_).Union(), + {intermediate_outputs[intermediate_outputs.size() - 1], zero_input_}, + {output_}); + + BuildInterpreter({GetShape(input_), GetShape(zero_input_)}); + + std::vector zero(GetTensorSize(input_), 0.0); + PopulateTensor(zero_input_, zero); + } + + void SetInput(std::vector value) { PopulateTensor(input_, value); } + + int CountNnApiPartitions() { + return std::count_if( + std::begin(interpreter_->execution_plan()), + std::end(interpreter_->execution_plan()), [this](const int node_index) { + return interpreter_->node_and_registration(node_index) + ->first.delegate != nullptr; + }); + } + 
+ private: + int input_; + int zero_input_; + int output_; +}; + +class NodeFilter { + public: + void ConfigureSupportedNodes( + int graph_size, const std::unordered_set& unsupported_indexes) { + graph_size_ = graph_size; + unsupported_indexes_ = unsupported_indexes; + } + + void SetNodeSupport(bool* supported_ops) { + for (int i = 0; i < graph_size_; i++) { + supported_ops[i] = (unsupported_indexes_.count(i) == 0); + } + } + + private: + int graph_size_; + std::unordered_set unsupported_indexes_; +}; + +// Using the same node filter for all DelegatePartitionLimitTests +// because StubGetSupportedOperationsForDevicesWith wants a C function. +NodeFilter* DelegatePartitionLimitTestNodeFilter() { + static NodeFilter* node_filter = new NodeFilter(); + return node_filter; +} + +class DelegatePartitionLimitTest + : public ::tflite::delegate::nnapi::NnApiDelegateMockTest { + protected: + // Configure the underlying graph to generate a set of nnapi partition + // with the sizes specified in nnapi_partition_sizes and the given + // input_shape. + void Init(int max_nnapi_partitions, + const std::vector& nnapi_partition_sizes, + const std::vector& input_shape) { + // The graph will have as number of nodes the sum of nodes in the NNAPI + // partitions plus nnapi_partition_sizes.size() - 1 nodes that will be + // not supported by NNAPI and will cause the + graph_size_ = std::accumulate(std::begin(nnapi_partition_sizes), + std::end(nnapi_partition_sizes), + nnapi_partition_sizes.size() - 1); + + std::unordered_set unsupported_ops_idxs; + int partition_node_idx = -1; + for (int i = 0; i < nnapi_partition_sizes.size() - 1; i++) { + partition_node_idx += nnapi_partition_sizes[i] + 1; + unsupported_ops_idxs.insert(partition_node_idx); + } + + DelegatePartitionLimitTestNodeFilter()->ConfigureSupportedNodes( + graph_size_, unsupported_ops_idxs); + + nnapi_mock_->StubGetSupportedOperationsForDevicesWith( + [](const ANeuralNetworksModel* model, + const ANeuralNetworksDevice* const* devices, uint32_t num_devices, + bool* supported_ops) -> int { + DelegatePartitionLimitTestNodeFilter()->SetNodeSupport(supported_ops); + return ANEURALNETWORKS_NO_ERROR; + }); + + model_ = std::make_unique( + input_shape, graph_size_, nnapi_mock_->GetNnApi(), + /*accelerator_name=*/"test-device", max_nnapi_partitions); + } + + std::unique_ptr model_; + + int OriginalGraphSize() { return graph_size_; } + + private: + int graph_size_; +}; + +TEST_F(DelegatePartitionLimitTest, ShouldDelegateOnePartitionOnly) { + Init(/*max_nnapi_partitions=*/1, + /*nnapi_partition_sizes=*/{3, 2}, + /*input_shape=*/{1, 2, 2, 1}); + + EXPECT_EQ(model_->CountNnApiPartitions(), 1); +} + +TEST_F(DelegatePartitionLimitTest, + ShouldDelegateAllPossiblePartitionsIfLimitIsZero) { + Init(/*max_nnapi_partitions=*/0, + /*nnapi_partition_sizes=*/{3, 2}, + /*input_shape=*/{1, 2, 2, 1}); + + EXPECT_EQ(model_->CountNnApiPartitions(), 2); +} + +TEST_F(DelegatePartitionLimitTest, + ShouldDelegateAllPossiblePartitionsIfLimitIsNegative) { + Init(/*max_nnapi_partitions=*/0, + /*nnapi_partition_sizes=*/{3, 2}, + /*input_shape=*/{1, 2, 2, 1}); + + EXPECT_EQ(model_->CountNnApiPartitions(), 2); +} + +TEST_F(DelegatePartitionLimitTest, + ShouldDelegateAllPossiblePartitionsIfBelowLimit) { + Init(/*max_nnapi_partitions=*/3, + /*nnapi_partition_sizes=*/{3, 2}, + /*input_shape=*/{1, 2, 2, 1}); + + EXPECT_EQ(model_->CountNnApiPartitions(), 2); +} + +TEST_F(DelegatePartitionLimitTest, ShouldDelegatePartitionWithHigherNodeCount) { + Init(/*max_nnapi_partitions=*/1, + 
/*nnapi_partition_sizes=*/{3, 2}, + /*input_shape=*/{1, 2, 2, 1}); + + EXPECT_EQ(model_->CountNnApiPartitions(), 1); + EXPECT_EQ(model_->CountOpsExecutedByCpuKernel(), OriginalGraphSize() - 3); +} + +TEST_F(DelegatePartitionLimitTest, + ShouldDelegatePartitionsWithHigherNodeCount) { + Init(/*max_nnapi_partitions=*/2, + /*nnapi_partition_sizes=*/{1, 5, 2, 4}, + /*input_shape=*/{1, 2, 2, 1}); + + EXPECT_EQ(model_->CountNnApiPartitions(), 2); + EXPECT_EQ(model_->CountOpsExecutedByCpuKernel(), OriginalGraphSize() - 9); +} + } // namespace } // namespace tflite From 876d602f5726fbef610944a54c0164440b1202d1 Mon Sep 17 00:00:00 2001 From: YoungSeok Yoon Date: Wed, 19 Feb 2020 07:46:08 -0800 Subject: [PATCH 232/442] Use --std=c++11 flag when targeting iOS platform PiperOrigin-RevId: 295966567 Change-Id: I6cfb19f70228d8fcb42504430eb2e28beec7c2e7 --- tensorflow/lite/tools/make/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/lite/tools/make/Makefile b/tensorflow/lite/tools/make/Makefile index c3280f0e62c..c010a38f924 100644 --- a/tensorflow/lite/tools/make/Makefile +++ b/tensorflow/lite/tools/make/Makefile @@ -68,6 +68,10 @@ ifeq ($(HOST_OS),windows) CXXFLAGS += -fext-numeric-literals -D__LITTLE_ENDIAN__ endif +ifeq ($(TARGET),ios) +CXXFLAGS += --std=c++11 +endif + # Auto-detect optimization opportunity if building natively. ifeq ($(HOST_OS),$(TARGET)) ifeq ($(HOST_ARCH),$(TARGET_ARCH)) From 2a5df36b594399e002929f976b65c62293aa0fef Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Wed, 19 Feb 2020 08:25:20 -0800 Subject: [PATCH 233/442] Fix invalid reference due to vector resize. When creating a new replicate op in the variable runtime reformatting pass, new_replicated_inputs holds references to replicated_inputs, but replicated_inputs could invalidate its elements due to resize. Reserve enough space to avoid this problem. PiperOrigin-RevId: 295974084 Change-Id: Ic4f36e90ab1807842ea73d9802b7b58b358b1c98 --- .../tensorflow/transforms/tpu_variable_runtime_reformatting.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc index 84ae3e735f2..e7bd44464d0 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc @@ -263,6 +263,7 @@ tf_device::ReplicateOp AddInputsToReplicateOp( llvm::SmallVector, Type>, 8> new_replicated_inputs; llvm::SmallVector, 8> replicated_inputs; + replicated_inputs.reserve(replicate.GetBody().getNumArguments()); for (auto arg : llvm::enumerate(replicate.GetBody().getArguments())) { int64_t i = arg.index(); replicated_inputs.emplace_back(); From 9456e895fc276b89e2f1f355ba32eecc78ea4d3e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 08:27:43 -0800 Subject: [PATCH 234/442] Add PopulationCount to the XLA Python client. 
PiperOrigin-RevId: 295974574 Change-Id: I8cf982db9b213b530354e01ed982cfa128f80ce8 --- tensorflow/compiler/xla/python/xla.cc | 1 + tensorflow/compiler/xla/python/xla_client.py | 1 + tensorflow/compiler/xla/python/xla_client_test.py | 6 ++++++ 3 files changed, 8 insertions(+) diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index a8d4ccb7fd5..cf3441229f9 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -496,6 +496,7 @@ void BuildOpsSubmodule(py::module* m) { #define UNARY_OP(op) ops.def(#op, &op) UNARY_OP(Not); + UNARY_OP(PopulationCount); UNARY_OP(Clz); UNARY_OP(Abs); UNARY_OP(Exp); diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index 65545306b0c..997343d2109 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -1635,6 +1635,7 @@ FftType = _xla.FftType _UNARY_OPS = [ 'Not', + 'PopulationCount', 'Clz', 'Abs', 'Exp', diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py index a3a16f09ce6..de5ae258976 100644 --- a/tensorflow/compiler/xla/python/xla_client_test.py +++ b/tensorflow/compiler/xla/python/xla_client_test.py @@ -969,6 +969,12 @@ class SingleOpTest(ComputationTest): c.Not(c.Constant(arr)) self._ExecuteAndCompareClose(c, expected=~arr) + def testPopulationCount(self): + c = self._NewComputation() + arr = NumpyArrayS32([3, 0, 1]) + c.PopulationCount(c.Constant(arr)) + self._ExecuteAndCompareClose(c, expected=np.array([2, 0, 1])) + def testCountLeadingZeros(self): c = self._NewComputation() arr = NumpyArrayS32([0x7FFF, 0x12345678]) From 45a8e4c1d042909362fca50d767245665c754d1b Mon Sep 17 00:00:00 2001 From: Edward Loper Date: Wed, 19 Feb 2020 08:37:58 -0800 Subject: [PATCH 235/442] Automated rollback of commit ed493143b14c31ebf16881a815e8904e6a82ff9a PiperOrigin-RevId: 295976413 Change-Id: I24d8fa6b6977fee4d2cb963259bb880775436bc6 --- .../python/ops/ragged/ragged_getitem.py | 85 ++++++++++++--- .../python/ops/ragged/ragged_tensor_test.py | 100 ++++++++++++++++-- 2 files changed, 164 insertions(+), 21 deletions(-) diff --git a/tensorflow/python/ops/ragged/ragged_getitem.py b/tensorflow/python/ops/ragged/ragged_getitem.py index eca3cc3cdfa..b380dae63c6 100644 --- a/tensorflow/python/ops/ragged/ragged_getitem.py +++ b/tensorflow/python/ops/ragged/ragged_getitem.py @@ -19,9 +19,12 @@ from __future__ import division from __future__ import print_function from tensorflow.python.eager import context +from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops +from tensorflow.python.ops import check_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops.ragged import ragged_gather_ops @@ -41,9 +44,6 @@ def ragged_tensor_getitem(self, key): principles of Python ("In the face of ambiguity, refuse the temptation to guess"), we simply disallow this operation. - Any dimensions added by `array_ops.newaxis` will be ragged if the following - dimension is ragged. - Args: self: The RaggedTensor to slice. key: Indicates which piece of the RaggedTensor to return, using standard @@ -134,15 +134,27 @@ def _ragged_getitem(rt_input, key_list): # that puts all values in a single row. 
if row_key is array_ops.newaxis: inner_rt = _ragged_getitem(rt_input, inner_keys) - nsplits = array_ops.shape(inner_rt.row_splits, - out_type=inner_rt.row_splits.dtype)[0] - return ragged_tensor.RaggedTensor.from_row_splits( - inner_rt, array_ops.stack([0, nsplits - 1]), validate=False) + nsplits = tensor_shape.dimension_at_index(inner_rt.row_splits.shape, 0) + if nsplits.value is not None: + nsplits = nsplits.value + else: + nsplits = array_ops.shape(inner_rt.row_splits, + out_type=inner_rt.row_splits.dtype)[0] + return ragged_tensor.RaggedTensor.from_uniform_row_length( + inner_rt, nsplits - 1, nrows=1, validate=False) # Slicing a range of rows: first slice the outer dimension, and then # call `_ragged_getitem_inner_dimensions` to handle the inner keys. if isinstance(row_key, slice): sliced_rt_input = _slice_ragged_row_dimension(rt_input, row_key) + if rt_input.uniform_row_length is not None: + # If the inner dimension has uniform_row_length, then preserve it (by + # re-wrapping the values in a new RaggedTensor). Note that the row + # length won't have changed, since we're slicing a range of rows (and not + # slicing the rows themselves). + sliced_rt_input = ragged_tensor.RaggedTensor.from_uniform_row_length( + sliced_rt_input.values, rt_input.uniform_row_length, + nrows=sliced_rt_input.nrows()) return _ragged_getitem_inner_dimensions(sliced_rt_input, inner_keys) # Indexing a single row: slice values to get the indicated row, and then @@ -245,11 +257,14 @@ def _ragged_getitem_inner_dimensions(rt_input, key_list): # RaggedTensor that puts each value in its own row. if column_key is array_ops.newaxis: inner_rt = _ragged_getitem_inner_dimensions(rt_input, key_list[1:]) - nsplits = array_ops.shape(inner_rt.row_splits, - out_type=inner_rt.row_splits.dtype)[0] - return ragged_tensor.RaggedTensor.from_row_splits(inner_rt, - math_ops.range(nsplits), - validate=False) + nsplits = tensor_shape.dimension_at_index(inner_rt.row_splits.shape, 0) + if nsplits.value is not None: + nsplits = nsplits.value + else: + nsplits = array_ops.shape(inner_rt.row_splits, + out_type=inner_rt.row_splits.dtype)[0] + return ragged_tensor.RaggedTensor.from_uniform_row_length( + inner_rt, 1, nrows=nsplits - 1, validate=False) # Slicing a range of columns in a ragged inner dimension. We use a # recursive call to process the values, and then assemble a RaggedTensor @@ -292,15 +307,59 @@ def _ragged_getitem_inner_dimensions(rt_input, key_list): lambda: math_ops.maximum(limits + stop_offset, lower_bound)) inner_rt = _build_ragged_tensor_from_value_ranges( inner_rt_starts, inner_rt_limits, column_key.step, rt_input.values) + # If the row dimension is uniform, then calculate the new + # uniform_row_length, and rebuild inner_rt using that uniform_row_lengths. + if rt_input.uniform_row_length is not None: + new_row_length = _slice_length(rt_input.uniform_row_length, column_key) + inner_rt = ragged_tensor.RaggedTensor.from_uniform_row_length( + inner_rt.values, new_row_length, rt_input.nrows()) return inner_rt.with_values( _ragged_getitem_inner_dimensions(inner_rt.values, key_list[1:])) # Indexing a single column in a ragged inner dimension: raise an Exception. # See RaggedTensor.__getitem__.__doc__ for an explanation of why indexing # into a ragged inner dimension is problematic. 
- else: + if rt_input.uniform_row_length is None: raise ValueError("Cannot index into an inner ragged dimension.") + # Indexing a single column in a uniform inner dimension: check that the + # given index is in-bounds, and then use a strided slice over rt_input.values + # to take the indicated element from each row. + row_length = rt_input.uniform_row_length + column_key = math_ops.cast(column_key, row_length.dtype) + oob_err_msg = "Index out of bounds when indexing into a ragged tensor" + oob_checks = [ + check_ops.assert_greater_equal( + column_key, -row_length, message=oob_err_msg), + check_ops.assert_less(column_key, row_length, message=oob_err_msg), + ] + with ops.control_dependencies(oob_checks): + offset = _if_ge_zero(column_key, lambda: column_key, + lambda: row_length + column_key) + sliced_rt = rt_input.values[offset::row_length] + return _ragged_getitem_inner_dimensions(sliced_rt, key_list[1:]) + + +def _slice_length(value_length, slice_key): + """Computes the number of elements in a slice of a value with a given length. + + Returns the equivalent of: `len(range(value_length)[slice_key])` + + Args: + value_length: Scalar int `Tensor`: the length of the value being sliced. + slice_key: A `slice` object used to slice elements from the the value. + + Returns: + The number of elements in the sliced value. + """ + # Note: we could compute the slice length without creating a zeros tensor + # with some variant of (stop-start)//step, but doing so would require more + # ops (for checking bounds, handling negative indices, negative step sizes, + # etc); and we expect this to be an uncommon operation, so we use this + # simpler implementation. + zeros = array_ops.zeros(value_length, dtype=dtypes.bool) + return array_ops.size(zeros[slice_key], out_type=value_length.dtype) + def _expand_ellipsis(key_list, num_remaining_dims): """Expands the ellipsis at the start of `key_list`. diff --git a/tensorflow/python/ops/ragged/ragged_tensor_test.py b/tensorflow/python/ops/ragged/ragged_tensor_test.py index 6bc066e5d84..f4c75d26699 100644 --- a/tensorflow/python/ops/ragged/ragged_tensor_test.py +++ b/tensorflow/python/ops/ragged/ragged_tensor_test.py @@ -116,6 +116,12 @@ EXAMPLE_RAGGED_TENSOR_4D_VALUES = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16], [17, 18], [19, 20]] +# Example 3D ragged tensor with uniform_row_lengths. +EXAMPLE_RAGGED_TENSOR_3D = [[[1, 2, 3], [4], [5, 6]], [[], [7, 8, 9], []]] +EXAMPLE_RAGGED_TENSOR_3D_ROWLEN = 3 +EXAMPLE_RAGGED_TENSOR_3D_SPLITS = [0, 3, 4, 6, 6, 9, 9] +EXAMPLE_RAGGED_TENSOR_3D_VALUES = [1, 2, 3, 4, 5, 6, 7, 8, 9] + def int32array(values): return np.array(values, dtype=np.int32) @@ -837,7 +843,7 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, # RaggedTensor.__getitem__ #============================================================================= - def _TestGetItem(self, rt, slice_spec, expected): + def _TestGetItem(self, rt, slice_spec, expected, expected_shape=None): """Helper function for testing RaggedTensor.__getitem__. Checks that calling `rt.__getitem__(slice_spec) returns the expected value. @@ -855,6 +861,7 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, slice_spec: The slice spec. expected: The expected value of rt.__getitem__(slice_spec), as a python list; or an exception class. + expected_shape: The expected shape for `rt.__getitem__(slice_spec)`. 
""" tensor_slice_spec1 = _make_tensor_slice_spec(slice_spec, True) tensor_slice_spec2 = _make_tensor_slice_spec(slice_spec, False) @@ -864,13 +871,18 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, self.assertAllEqual(value1, expected, 'slice_spec=%s' % (slice_spec,)) self.assertAllEqual(value2, expected, 'slice_spec=%s' % (slice_spec,)) self.assertAllEqual(value3, expected, 'slice_spec=%s' % (slice_spec,)) + if expected_shape is not None: + value1.shape.assert_is_compatible_with(expected_shape) + value2.shape.assert_is_compatible_with(expected_shape) + value3.shape.assert_is_compatible_with(expected_shape) def _TestGetItemException(self, rt, slice_spec, expected, message): """Helper function for testing RaggedTensor.__getitem__ exceptions.""" - tensor_slice_spec1 = _make_tensor_slice_spec(slice_spec, True) - self.assertRaisesRegexp(expected, message, rt.__getitem__, slice_spec) - self.assertRaisesRegexp(expected, message, rt.__getitem__, - tensor_slice_spec1) + tensor_slice_spec = _make_tensor_slice_spec(slice_spec, True) + with self.assertRaisesRegexp(expected, message): + self.evaluate(rt.__getitem__(slice_spec)) + with self.assertRaisesRegexp(expected, message): + self.evaluate(rt.__getitem__(tensor_slice_spec)) @parameterized.parameters( # Tests for rt[i] @@ -1225,12 +1237,84 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, self.assertEqual(rt_newaxis3.ragged_rank, 2) self.assertEqual(rt_newaxis4.ragged_rank, 2) - self.assertEqual(rt_newaxis0.shape.as_list(), [1, None, None, None, 2]) - self.assertEqual(rt_newaxis1.shape.as_list(), [2, None, None, None, 2]) - self.assertEqual(rt_newaxis2.shape.as_list(), [2, None, None, None, 2]) + self.assertEqual(rt_newaxis0.shape.as_list(), [1, 2, None, None, 2]) + self.assertEqual(rt_newaxis1.shape.as_list(), [2, 1, None, None, 2]) + self.assertEqual(rt_newaxis2.shape.as_list(), [2, None, 1, None, 2]) self.assertEqual(rt_newaxis3.shape.as_list(), [2, None, None, 1, 2]) self.assertEqual(rt_newaxis4.shape.as_list(), [2, None, None, 2, 1]) + @parameterized.parameters( + # EXAMPLE_RAGGED_TENSOR_3D.shape = [2, 3, None] + + # Indexing into uniform_row_splits dimension: + (SLICE_BUILDER[:, 1], [r[1] for r in EXAMPLE_RAGGED_TENSOR_3D], + [2, None]), + (SLICE_BUILDER[:, 2], [r[2] for r in EXAMPLE_RAGGED_TENSOR_3D], + [2, None]), + (SLICE_BUILDER[:, -2], [r[-2] for r in EXAMPLE_RAGGED_TENSOR_3D], + [2, None]), + (SLICE_BUILDER[:, -3], [r[-3] for r in EXAMPLE_RAGGED_TENSOR_3D], + [2, None]), + (SLICE_BUILDER[1:, 2], [r[2] for r in EXAMPLE_RAGGED_TENSOR_3D[1:]], + [1, None]), + (SLICE_BUILDER[:, 1, 1:], [r[1][1:] for r in EXAMPLE_RAGGED_TENSOR_3D], + [2, None]), + (SLICE_BUILDER[1:, 1, 1:], + [r[1][1:] for r in EXAMPLE_RAGGED_TENSOR_3D[1:]], + [1, None]), + + # Slicing uniform_row_splits dimension: + (SLICE_BUILDER[:, 2:], [r[2:] for r in EXAMPLE_RAGGED_TENSOR_3D], + [2, 1, None]), + (SLICE_BUILDER[:, -2:], [r[-2:] for r in EXAMPLE_RAGGED_TENSOR_3D], + [2, 2, None]), + (SLICE_BUILDER[:, :, 1:], + [[c[1:] for c in r] for r in EXAMPLE_RAGGED_TENSOR_3D], + [2, 3, None]), + (SLICE_BUILDER[:, 5:], [r[5:] for r in EXAMPLE_RAGGED_TENSOR_3D], + [2, 0, None]), + + # Slicing uniform_row_splits dimension with a non-default step size: + (SLICE_BUILDER[:, ::2], [r[::2] for r in EXAMPLE_RAGGED_TENSOR_3D], + [2, 2, None]), + (SLICE_BUILDER[:, ::-1], [r[::-1] for r in EXAMPLE_RAGGED_TENSOR_3D], + [2, 3, None]), + ) + def testRaggedTensorGetItemWithUniformRowLength(self, slice_spec, expected, + expected_shape): + """Test that rt.__getitem__(slice_spec) == 
expected.""" + rt = RaggedTensor.from_uniform_row_length( + RaggedTensor.from_row_splits( + EXAMPLE_RAGGED_TENSOR_3D_VALUES, + EXAMPLE_RAGGED_TENSOR_3D_SPLITS), + EXAMPLE_RAGGED_TENSOR_3D_ROWLEN) + self.assertAllEqual(rt, EXAMPLE_RAGGED_TENSOR_3D) + self.assertIsNot(rt.uniform_row_length, None) + self._TestGetItem(rt, slice_spec, expected, expected_shape) + + # If the result is 3D, then check that it still has a uniform row length: + actual = rt.__getitem__(slice_spec) + if actual.shape.rank == 3: + self.assertIsNot(actual.uniform_row_length, None) + self.assertAllEqual(actual.uniform_row_length, expected_shape[1]) + + @parameterized.parameters( + (SLICE_BUILDER[:, 3], errors.InvalidArgumentError, 'out of bounds'), + (SLICE_BUILDER[:, -4], errors.InvalidArgumentError, 'out of bounds'), + (SLICE_BUILDER[:, 10], errors.InvalidArgumentError, 'out of bounds'), + (SLICE_BUILDER[:, -10], errors.InvalidArgumentError, 'out of bounds'), + ) + def testRaggedTensorGetItemErrorsWithUniformRowLength(self, slice_spec, + expected, message): + """Test that rt.__getitem__(slice_spec) == expected.""" + rt = RaggedTensor.from_uniform_row_length( + RaggedTensor.from_row_splits( + EXAMPLE_RAGGED_TENSOR_3D_VALUES, + EXAMPLE_RAGGED_TENSOR_3D_SPLITS), + EXAMPLE_RAGGED_TENSOR_3D_ROWLEN) + self.assertAllEqual(rt, EXAMPLE_RAGGED_TENSOR_3D) + self._TestGetItemException(rt, slice_spec, expected, message) + #============================================================================= # RaggedTensor.__str__ #============================================================================= From e8fa1fa4067a7777b1f322e4488a596269f18f44 Mon Sep 17 00:00:00 2001 From: Vincent ABRIOU Date: Wed, 19 Feb 2020 17:43:09 +0100 Subject: [PATCH 236/442] TFLite: static library: fix benchmark build issue Since the commit ee7642b2670e33a45cc3a6f6585cfab7f7d4f8f6, the benchmark application is no more building due to the fact that some functions have been moved. Add profile_summary_formatter.cc in the PROFILE_SUMMARIZER_SRCS. Signed-off-by: Vincent ABRIOU --- tensorflow/lite/tools/make/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/lite/tools/make/Makefile b/tensorflow/lite/tools/make/Makefile index c010a38f924..c1a20eccb0a 100644 --- a/tensorflow/lite/tools/make/Makefile +++ b/tensorflow/lite/tools/make/Makefile @@ -109,6 +109,7 @@ PROFILER_SRCS := \ PROFILE_SUMMARIZER_SRCS := \ tensorflow/lite/profiling/profile_summarizer.cc \ + tensorflow/lite/profiling/profile_summary_formatter.cc \ tensorflow/core/util/stats_calculator.cc CMD_LINE_TOOLS_SRCS := \ From 5bf57ef11af4528c0a3bcc9a63652955d2c5e97e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 08:47:19 -0800 Subject: [PATCH 237/442] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 295978060 Change-Id: If0efa7ed880f18575aa7788f877bd1c53c419e8d --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index f69affe5e8a..c744d5b466a 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21345,7 +21345,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22053,7 +22053,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22249,7 +22249,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22318,7 +22318,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22433,7 +22433,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22492,7 +22492,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22666,7 +22666,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22857,7 +22857,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25297,7 +25297,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25629,7 +25629,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25679,7 +25679,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25929,7 +25929,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26559,7 +26559,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27624,7 +27624,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45491,7 +45491,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 75cce895f56524e5514ce2cd0d300ab6c0a5b972 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 08:50:06 -0800 Subject: [PATCH 238/442] Add "Launch Activities" derived line. And add stats for each step , such as how many kernel/memcpy are launched, what's maximum/avg launch time. PiperOrigin-RevId: 295978584 Change-Id: Ib2c418ecf034283613e960dd26ddf488ed5ba1bb --- tensorflow/core/profiler/utils/BUILD | 2 + .../core/profiler/utils/derived_timeline.cc | 97 +++++++++++++++++++ .../core/profiler/utils/derived_timeline.h | 5 + tensorflow/core/profiler/utils/trace_utils.h | 11 ++- .../core/profiler/utils/xplane_utils.cc | 8 ++ tensorflow/core/profiler/utils/xplane_utils.h | 4 + .../core/profiler/utils/xplane_visitor.h | 2 +- 7 files changed, 123 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/profiler/utils/BUILD b/tensorflow/core/profiler/utils/BUILD index fbf57be45c8..07d5598171e 100644 --- a/tensorflow/core/profiler/utils/BUILD +++ b/tensorflow/core/profiler/utils/BUILD @@ -273,6 +273,7 @@ cc_library( ":group_events", ":tf_op_utils", ":tf_xplane_visitor", + ":timespan", ":trace_utils", ":xplane_builder", ":xplane_schema", @@ -281,6 +282,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", + "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/core/profiler/utils/derived_timeline.cc b/tensorflow/core/profiler/utils/derived_timeline.cc index e4f0bd0f5af..ef9f308965b 100644 --- a/tensorflow/core/profiler/utils/derived_timeline.cc +++ b/tensorflow/core/profiler/utils/derived_timeline.cc @@ -14,11 +14,14 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/profiler/utils/derived_timeline.h" +#include "absl/container/flat_hash_map.h" +#include "absl/strings/match.h" #include "absl/strings/str_split.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/tf_op_utils.h" #include "tensorflow/core/profiler/utils/tf_xplane_visitor.h" +#include "tensorflow/core/profiler/utils/timespan.h" #include "tensorflow/core/profiler/utils/trace_utils.h" #include "tensorflow/core/profiler/utils/xplane_builder.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" @@ -113,6 +116,7 @@ const absl::string_view kDerivedLineTensorFlowNameScope = const absl::string_view kDerivedLineTensorFlowOps = "TensorFlow Ops"; const absl::string_view kDerivedLineXlaModules = "XLA Modules"; const absl::string_view kDerivedLineXlaOps = "XLA Ops"; +const absl::string_view kDerivedLineKernelLaunch = "Launch Stats"; const absl::string_view kAnnotationDelimiter = "::"; void ProcessTfOpEvent(const XEventVisitor& event, @@ -231,6 +235,99 @@ void DeriveEventsFromAnnotations(const SymbolResolver& symbol_resolver, RemoveEmptyLines(device_trace); } +void DeriveEventsFromHostTrace(const XPlane* host_trace, + const EventGroupNameMap& event_group_name_map, + std::vector device_traces) { + struct GroupLaunchInfo { // "Group" normally means step. 
+ Timespan timespan; + int32 num_launches = 0; + uint64 max_launch_time_ps = 0ULL; + uint64 total_launch_time_ps = 0ULL; + }; + typedef absl::flat_hash_map + DeviceLaunchInfo; + + int num_devices = device_traces.size(); + std::vector per_device_launch_info(num_devices); + + XPlaneVisitor host_plane = CreateTfXPlaneVisitor(host_trace); + host_plane.ForEachLine([&](const XLineVisitor& line) { + if (IsDerivedThreadId(line.Id())) return; + line.ForEachEvent([&](const XEventVisitor& event) { + absl::optional group_id; + absl::optional device_id; + absl::optional correlation_id; + // Filter out API calls for cuEventRecord/cuEventQuery/cuCtxSynchronize + // etc for now. TODO: find a better way to filter out only the memcpy and + // kernel launch events. + if (absl::StartsWith(event.Name(), "cu")) return; + event.ForEachStat([&](const XStatVisitor& stat) { + if (stat.Type() == StatType::kGroupId) { + group_id = stat.IntValue(); + } else if (stat.Type() == StatType::kDeviceId) { + device_id = stat.IntValue(); + } else if (stat.Type() == StatType::kCorrelationId) { + correlation_id = stat.IntValue(); + } + }); + if (group_id && device_id && correlation_id && *device_id >= 0 && + *device_id < num_devices) { + // This is a launch event on a known device. + GroupLaunchInfo& group_launch_info = + per_device_launch_info[*device_id][*group_id]; + Timespan& group_span = group_launch_info.timespan; + Timespan event_span = event.GetTimespan(); + if (group_launch_info.num_launches) { // Existing group. + uint64 begin_ps = + std::min(group_span.begin_ps(), event_span.begin_ps()); + uint64 end_ps = std::max(group_span.end_ps(), event_span.end_ps()); + group_span = Timespan::FromEndPoints(begin_ps, end_ps); + } else { + group_span = event_span; + } + ++group_launch_info.num_launches; + group_launch_info.max_launch_time_ps = std::max( + group_launch_info.max_launch_time_ps, event_span.duration_ps()); + group_launch_info.total_launch_time_ps += event_span.duration_ps(); + } + }); + }); + + uint64 host_plane_start = GetStartTimestampNs(*host_trace); + for (int i = 0; i < num_devices; ++i) { + if (per_device_launch_info[i].empty()) continue; + uint64 device_plane_start = GetStartTimestampNs(*device_traces[i]); + XPlaneBuilder device_plane(device_traces[i]); + XLineBuilder launch_line = + device_plane.GetOrCreateLine(kThreadIdKernelLaunch); + launch_line.SetName(kDerivedLineKernelLaunch); + launch_line.SetTimestampNs(std::min(device_plane_start, host_plane_start)); + for (const auto& [group_id, group_info] : per_device_launch_info[i]) { + if (auto group_name = gtl::FindOrNull(event_group_name_map, group_id)) { + XEventBuilder device_event = + launch_line.AddEvent(*device_plane.GetOrCreateEventMetadata( + absl::StrCat("Launch Stats for ", *group_name))); + device_event.SetTimestampNs( + host_plane_start + PicosToNanos(group_info.timespan.begin_ps())); + device_event.SetDurationPs(group_info.timespan.duration_ps()); + device_event.AddStatValue(*device_plane.GetOrCreateStatMetadata( + GetStatTypeStr(StatType::kGroupId)), + group_id); + device_event.AddStatValue( + *device_plane.GetOrCreateStatMetadata("num_launches"), + group_info.num_launches); + device_event.AddStatValue( + *device_plane.GetOrCreateStatMetadata("max_launch_time_us"), + PicosToMicros(group_info.max_launch_time_ps)); + device_event.AddStatValue( + *device_plane.GetOrCreateStatMetadata("avg_launch_time_us"), + PicosToMicros(group_info.total_launch_time_ps / + group_info.num_launches)); + } + } + } +} + void GenerateDerivedTimeLines(const 
EventGroupNameMap& event_group_name_map, XSpace* space) { // TODO(profiler): Once we capture HLO protos for xla/gpu, we should use that diff --git a/tensorflow/core/profiler/utils/derived_timeline.h b/tensorflow/core/profiler/utils/derived_timeline.h index 8b8a5ad9e35..5a99251a57c 100644 --- a/tensorflow/core/profiler/utils/derived_timeline.h +++ b/tensorflow/core/profiler/utils/derived_timeline.h @@ -35,6 +35,11 @@ void DeriveEventsFromAnnotations(const SymbolResolver& symbol_resolver, const EventGroupNameMap& event_group_name_map, XPlane* device_trace); +// Derives "Launch Activities Summary" line from host trace. +void DeriveEventsFromHostTrace(const XPlane* host_trace, + const EventGroupNameMap& event_group_name_map, + std::vector device_traces); + // Loops through XPlanes of input XSpace, if it is "device" XPlane, generating // derived timelines for the plane by calling DeriveEventsFromAnnotations. void GenerateDerivedTimeLines(const EventGroupNameMap& event_group_name_map, diff --git a/tensorflow/core/profiler/utils/trace_utils.h b/tensorflow/core/profiler/utils/trace_utils.h index b6133bd360c..024330faa79 100644 --- a/tensorflow/core/profiler/utils/trace_utils.h +++ b/tensorflow/core/profiler/utils/trace_utils.h @@ -23,11 +23,12 @@ namespace profiler { // First derived stream/thread id. constexpr int kThreadIdDerivedMin = 0xdeadbeef; constexpr int kThreadIdStepInfo = kThreadIdDerivedMin; -constexpr int kThreadIdTfNameScope = kThreadIdDerivedMin + 1; -constexpr int kThreadIdTfOp = kThreadIdDerivedMin + 2; -constexpr int kThreadIdHloModule = kThreadIdDerivedMin + 3; -constexpr int kThreadIdHloOp = kThreadIdDerivedMin + 4; -constexpr int kThreadIdOverhead = kThreadIdDerivedMin + 5; +constexpr int kThreadIdKernelLaunch = kThreadIdDerivedMin + 1; +constexpr int kThreadIdTfNameScope = kThreadIdDerivedMin + 2; +constexpr int kThreadIdTfOp = kThreadIdDerivedMin + 3; +constexpr int kThreadIdHloModule = kThreadIdDerivedMin + 4; +constexpr int kThreadIdHloOp = kThreadIdDerivedMin + 5; +constexpr int kThreadIdOverhead = kThreadIdDerivedMin + 6; // Last derived stream/thread id. constexpr int kThreadIdDerivedMax = kThreadIdOverhead; diff --git a/tensorflow/core/profiler/utils/xplane_utils.cc b/tensorflow/core/profiler/utils/xplane_utils.cc index 8b3012a5ea8..0bca0e39f7a 100644 --- a/tensorflow/core/profiler/utils/xplane_utils.cc +++ b/tensorflow/core/profiler/utils/xplane_utils.cc @@ -278,5 +278,13 @@ void MergePlanes(const XPlane& src_plane, XPlane* dst_plane) { }); } +uint64 GetStartTimestampNs(const XPlane& plane) { + int64 plane_timestamp = 0; + for (const auto& line : plane.lines()) { + plane_timestamp = std::min(plane_timestamp, line.timestamp_ns()); + } + return plane_timestamp; +} + } // namespace profiler } // namespace tensorflow diff --git a/tensorflow/core/profiler/utils/xplane_utils.h b/tensorflow/core/profiler/utils/xplane_utils.h index 787bc0eed0a..2a227f73dbb 100644 --- a/tensorflow/core/profiler/utils/xplane_utils.h +++ b/tensorflow/core/profiler/utils/xplane_utils.h @@ -82,6 +82,10 @@ void NormalizeTimeLine(XSpace* space, uint64 start_time_ns); // events offset timestamp correspondingly. void MergePlanes(const XPlane& src_plane, XPlane* dst_plane); +// Plane's start timestamp is defined as the minimum of all lines' start +// timestamps. 
If zero line exists, return 0; +uint64 GetStartTimestampNs(const XPlane& plane); + } // namespace profiler } // namespace tensorflow diff --git a/tensorflow/core/profiler/utils/xplane_visitor.h b/tensorflow/core/profiler/utils/xplane_visitor.h index a341d708d58..b0744810684 100644 --- a/tensorflow/core/profiler/utils/xplane_visitor.h +++ b/tensorflow/core/profiler/utils/xplane_visitor.h @@ -133,9 +133,9 @@ class XEventVisitor : public XStatsOwner { const XEventMetadata* metadata() const { return metadata_; } - private: Timespan GetTimespan() const { return Timespan(TimestampPs(), DurationPs()); } + private: const XPlaneVisitor* plane_; const XLine* line_; const XEvent* event_; From 0085ae0f6089bb04fc437eb79dd34bd0d19e3bda Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Wed, 19 Feb 2020 09:25:57 -0800 Subject: [PATCH 239/442] [SparseToDense] Fix benchmark reporting for the SparseToDense op. Previously, we were using a concatenated benchmark arg for the size and rank, which appears to have led to name collisions when the benchmark reporter makes the benchmark name human readable. This change switches the benchmark to using an explicit `ArgPair` for those arguments, which reports them separately. In addition, this change switches the benchmark to use a DT_INT64 indices tensor, which matches the common case for tf.SparseTensor usage in the TensorFlow API, and avoids a cast in the SparseToDenseOp implementation. PiperOrigin-RevId: 295985981 Change-Id: I94df39e2ef64b387f4c158ef0c8b33153c3cca4c --- .../core/kernels/sparse_to_dense_op_test.cc | 44 ++++++++----------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/tensorflow/core/kernels/sparse_to_dense_op_test.cc b/tensorflow/core/kernels/sparse_to_dense_op_test.cc index 2ed0b0948c3..84e1e09c219 100644 --- a/tensorflow/core/kernels/sparse_to_dense_op_test.cc +++ b/tensorflow/core/kernels/sparse_to_dense_op_test.cc @@ -198,13 +198,7 @@ TEST_F(SparseToDenseTest, ThreeD_MultValues) { } // namespace -static int BM_Arg(int ndim, int n) { return (ndim * 1000000) + n; } -static int NDIM_from_arg(int bm_arg) { return bm_arg / 1000000; } -static int N_from_arg(int bm_arg) { return bm_arg % 1000000; } - -static void BM_SparseToDense(int iters, const int bm_arg) { - const int NDIM = NDIM_from_arg(bm_arg); - const int N = N_from_arg(bm_arg); +static void BM_SparseToDense(int iters, int NDIM, int N) { // TODO(zhifengc): Switch to use kernel_benchmark_testlib.h tensorflow::testing::StopTiming(); @@ -217,7 +211,7 @@ static void BM_SparseToDense(int iters, const int bm_arg) { // Create a dense tensor with dims [1, ..., 1, N] Tensor output_shape(DT_INT32, TensorShape({NDIM})); - Tensor sparse_indices(DT_INT32, TensorShape({N, NDIM})); + Tensor sparse_indices(DT_INT64, TensorShape({N, NDIM})); Tensor sparse_values(DT_FLOAT, TensorShape({N})); Tensor default_value(DT_FLOAT, TensorShape({})); auto output_shape_t = output_shape.vec(); @@ -225,7 +219,7 @@ static void BM_SparseToDense(int iters, const int bm_arg) { output_shape_t(d) = (d == IndexDim) ? N : 3; } - auto sparse_indices_t = sparse_indices.matrix(); + auto sparse_indices_t = sparse_indices.matrix(); for (int n = 0; n < N; ++n) { for (int d = 0; d < NDIM; ++d) sparse_indices_t(n, d) = (d == IndexDim) ? 
n : 0; @@ -274,21 +268,21 @@ static void BM_SparseToDense(int iters, const int bm_arg) { } BENCHMARK(BM_SparseToDense) - ->Arg(BM_Arg(1, 10)) - ->Arg(BM_Arg(1, 100)) - ->Arg(BM_Arg(1, 1000)) - ->Arg(BM_Arg(1, 10000)) - ->Arg(BM_Arg(2, 10)) - ->Arg(BM_Arg(2, 100)) - ->Arg(BM_Arg(2, 1000)) - ->Arg(BM_Arg(2, 10000)) - ->Arg(BM_Arg(3, 10)) - ->Arg(BM_Arg(3, 100)) - ->Arg(BM_Arg(3, 1000)) - ->Arg(BM_Arg(3, 10000)) - ->Arg(BM_Arg(5, 10)) - ->Arg(BM_Arg(5, 100)) - ->Arg(BM_Arg(5, 1000)) - ->Arg(BM_Arg(5, 10000)); + ->ArgPair(1, 10) + ->ArgPair(1, 100) + ->ArgPair(1, 1000) + ->ArgPair(1, 10000) + ->ArgPair(2, 10) + ->ArgPair(2, 100) + ->ArgPair(2, 1000) + ->ArgPair(2, 10000) + ->ArgPair(3, 10) + ->ArgPair(3, 100) + ->ArgPair(3, 1000) + ->ArgPair(3, 10000) + ->ArgPair(5, 10) + ->ArgPair(5, 100) + ->ArgPair(5, 1000) + ->ArgPair(5, 10000); } // namespace tensorflow From df36e873c5f0a8c532f7ff37500453bdabddeafa Mon Sep 17 00:00:00 2001 From: Guangda Lai Date: Wed, 19 Feb 2020 09:27:06 -0800 Subject: [PATCH 240/442] Fix convert_nodes.cc to support TRT7. PiperOrigin-RevId: 295986213 Change-Id: I6221fca9278c02c2bb657b844980d2b2aef21a44 --- tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index 82bd8d4592f..433564513db 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -660,6 +660,9 @@ size_t TRT_ShapedWeights::size_bytes() const { data_type_size = 2; break; case nvinfer1::DataType::kINT8: +#if IS_TRT_VERSION_GE(7, 0, 0, 0) + case nvinfer1::DataType::kBOOL: +#endif data_type_size = 1; break; } From 1dfd8dc2c437002a6ad98d1e8e5d87a870113787 Mon Sep 17 00:00:00 2001 From: Gunhan Gulsoy Date: Wed, 19 Feb 2020 09:38:55 -0800 Subject: [PATCH 241/442] Avoid using hardcoded path in benchmark_test /tmp does not exist on windows. PiperOrigin-RevId: 295988695 Change-Id: Ie45e6311df617462f4ba10354fabbeaf2eb05127 --- tensorflow/python/platform/benchmark_test.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/python/platform/benchmark_test.py b/tensorflow/python/platform/benchmark_test.py index 17605984e70..2a395baae00 100644 --- a/tensorflow/python/platform/benchmark_test.py +++ b/tensorflow/python/platform/benchmark_test.py @@ -27,7 +27,7 @@ from tensorflow.python.platform import test class BenchmarkTest(test.TestCase, benchmark.TensorFlowBenchmark): def testReportBenchmark(self): - output_dir = '/tmp/' + output_dir = self.get_temp_dir() + os.path.sep os.environ['TEST_REPORT_FILE_PREFIX'] = output_dir proto_file_path = os.path.join(output_dir, 'BenchmarkTest.testReportBenchmark') @@ -80,4 +80,3 @@ class BenchmarkTest(test.TestCase, benchmark.TensorFlowBenchmark): if __name__ == '__main__': test.main() - From c40c5dfbd6f15108e41a268e81fdd6111720091f Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Wed, 19 Feb 2020 09:48:30 -0800 Subject: [PATCH 242/442] [TF wheel] Put tensorflow & third_party headers in the right directory (again). Prior to this, headers meant to go into include/{third_party,tensorflow} were being put into a tensorflow-xxx.data/purelib/include directory during bdist build. Now they're placed into tensorflow/include/ and the wheel installs them where they're expected. 
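As a rough post-install sanity check, the headers should resolve under the package's include directory once a wheel built with this change is installed. This sketch is illustrative only; the exact paths depend on the install prefix, and it assumes `tf.sysconfig.get_include()` points at the packaged include directory.

```python
import os
import tensorflow as tf

include_dir = tf.sysconfig.get_include()  # e.g. .../site-packages/tensorflow/include
print(include_dir)
print(os.path.isdir(os.path.join(include_dir, "tensorflow")))   # expected True
print(os.path.isdir(os.path.join(include_dir, "third_party")))  # expected True
```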
PiperOrigin-RevId: 295990673 Change-Id: I0606c75780ae6cda93bc009d3aa5bf03e51e2734 --- tensorflow/tools/pip_package/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index 55972e1d4ca..d4e92700eac 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -143,7 +143,7 @@ class InstallCommand(InstallCommandBase): def finalize_options(self): ret = InstallCommandBase.finalize_options(self) - self.install_headers = os.path.join(self.install_purelib, 'tensorflow', + self.install_headers = os.path.join(self.install_platlib, 'tensorflow', 'include') self.install_lib = self.install_platlib return ret From 478ea62407e810a9e0e147ad1cb6d253dc0b782f Mon Sep 17 00:00:00 2001 From: Jakob Buchgraber Date: Wed, 19 Feb 2020 09:49:55 -0800 Subject: [PATCH 243/442] Support remote repositories in TF_*_CONFIG_REPO environment variables Currently TF_*_CONFIG_REPO environment variables point to checked in preconfig packages. After migrating to remote config they will point to remote repositories. The "config_repo_label" function ensures both ways continue to work. PiperOrigin-RevId: 295990961 Change-Id: I7637ff5298893d4ee77354e9b48f87b8c328c301 --- third_party/gpus/cuda_configure.bzl | 7 ++++--- third_party/gpus/rocm_configure.bzl | 7 ++++--- third_party/nccl/nccl_configure.bzl | 5 +++-- third_party/py/python_configure.bzl | 3 ++- third_party/remote_config/common.bzl | 21 +++++++++++++++++++++ third_party/tensorrt/tensorrt_configure.bzl | 9 +++++---- 6 files changed, 39 insertions(+), 13 deletions(-) diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index 5dcdfdbad73..caf7cccfb9f 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -39,6 +39,7 @@ load( ) load( "//third_party/remote_config:common.bzl", + "config_repo_label", "err_out", "execute", "get_bash_bin", @@ -1156,17 +1157,17 @@ def _create_remote_cuda_repository(repository_ctx, remote_config_repo): ) repository_ctx.template( "cuda/BUILD", - Label(remote_config_repo + "/cuda:BUILD"), + config_repo_label(remote_config_repo, "cuda:BUILD"), {}, ) repository_ctx.template( "cuda/build_defs.bzl", - Label(remote_config_repo + "/cuda:build_defs.bzl"), + config_repo_label(remote_config_repo, "cuda:build_defs.bzl"), {}, ) repository_ctx.template( "cuda/cuda/cuda_config.h", - Label(remote_config_repo + "/cuda:cuda/cuda_config.h"), + config_repo_label(remote_config_repo, "cuda:cuda/cuda_config.h"), {}, ) diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl index 063271b83f2..e26e9b485b1 100644 --- a/third_party/gpus/rocm_configure.bzl +++ b/third_party/gpus/rocm_configure.bzl @@ -21,6 +21,7 @@ load( ) load( "//third_party/remote_config:common.bzl", + "config_repo_label", "err_out", "execute", "files_exist", @@ -797,17 +798,17 @@ def _create_remote_rocm_repository(repository_ctx, remote_config_repo): ) repository_ctx.template( "rocm/BUILD", - Label(remote_config_repo + "/rocm:BUILD"), + config_repo_label(remote_config_repo, "rocm:BUILD"), {}, ) repository_ctx.template( "rocm/build_defs.bzl", - Label(remote_config_repo + "/rocm:build_defs.bzl"), + config_repo_label(remote_config_repo, "rocm:build_defs.bzl"), {}, ) repository_ctx.template( "rocm/rocm/rocm_config.h", - Label(remote_config_repo + "/rocm:rocm/rocm_config.h"), + config_repo_label(remote_config_repo, "rocm:rocm/rocm_config.h"), {}, ) diff --git 
a/third_party/nccl/nccl_configure.bzl b/third_party/nccl/nccl_configure.bzl index 4081ec156d5..f05ef7e7a6e 100644 --- a/third_party/nccl/nccl_configure.bzl +++ b/third_party/nccl/nccl_configure.bzl @@ -19,6 +19,7 @@ load( ) load( "//third_party/remote_config:common.bzl", + "config_repo_label", "get_cpu_value", "get_host_environ", ) @@ -116,7 +117,7 @@ def _create_local_nccl_repository(repository_ctx): def _create_remote_nccl_repository(repository_ctx, remote_config_repo): repository_ctx.template( "BUILD", - Label(remote_config_repo + ":BUILD"), + config_repo_label(remote_config_repo, ":BUILD"), {}, ) @@ -124,7 +125,7 @@ def _create_remote_nccl_repository(repository_ctx, remote_config_repo): if nccl_version == "": repository_ctx.template( "build_defs.bzl", - Label(remote_config_repo + ":build_defs.bzl"), + config_repo_label(remote_config_repo, ":build_defs.bzl"), {}, ) diff --git a/third_party/py/python_configure.bzl b/third_party/py/python_configure.bzl index bbeaa46f332..a82839c556c 100644 --- a/third_party/py/python_configure.bzl +++ b/third_party/py/python_configure.bzl @@ -13,6 +13,7 @@ load( "PYTHON_LIB_PATH", "TF_PYTHON_CONFIG_REPO", "auto_config_fail", + "config_repo_label", "execute", "get_bash_bin", "get_host_environ", @@ -249,7 +250,7 @@ def _create_local_python_repository(repository_ctx): def _create_remote_python_repository(repository_ctx, remote_config_repo): """Creates pointers to a remotely configured repo set up to build with Python. """ - repository_ctx.template("BUILD", Label(remote_config_repo + ":BUILD"), {}) + repository_ctx.template("BUILD", config_repo_label(remote_config_repo, ":BUILD"), {}) def _python_autoconf_impl(repository_ctx): """Implementation of the python_autoconf repository rule.""" diff --git a/third_party/remote_config/common.bzl b/third_party/remote_config/common.bzl index 353e9bb1a63..140cd222e43 100644 --- a/third_party/remote_config/common.bzl +++ b/third_party/remote_config/common.bzl @@ -282,3 +282,24 @@ def err_out(result): if len(result.stderr) == 0: return result.stdout return result.stderr + +def config_repo_label(config_repo, target): + """Construct a label from config_repo and target. + + This function exists to ease the migration from preconfig to remote config. In preconfig + the TF_*_CONFIG_REPO environ variables are set to packages in the main repo while in + remote config they will point to remote repositories. + + Args: + config_repo: a remote repository or package. + target: a target + Returns: + A label constructed from config_repo and target. + """ + if config_repo.startswith("@") and not config_repo.find("//") > 0: + # remote config is being used. + return Label(config_repo + "//" + target) + elif target.startswith(":"): + return Label(config_repo + target) + else: + return Label(config_repo + "/" + target) diff --git a/third_party/tensorrt/tensorrt_configure.bzl b/third_party/tensorrt/tensorrt_configure.bzl index 484a85649d9..f08ded2fee4 100644 --- a/third_party/tensorrt/tensorrt_configure.bzl +++ b/third_party/tensorrt/tensorrt_configure.bzl @@ -14,6 +14,7 @@ load( ) load( "//third_party/remote_config:common.bzl", + "config_repo_label", "get_cpu_value", "get_host_environ", ) @@ -153,20 +154,20 @@ def _tensorrt_configure_impl(repository_ctx): if get_host_environ(repository_ctx, _TF_TENSORRT_CONFIG_REPO) != None: # Forward to the pre-configured remote repository. 
remote_config_repo = repository_ctx.os.environ[_TF_TENSORRT_CONFIG_REPO] - repository_ctx.template("BUILD", Label(remote_config_repo + ":BUILD"), {}) + repository_ctx.template("BUILD", config_repo_label(remote_config_repo, ":BUILD"), {}) repository_ctx.template( "build_defs.bzl", - Label(remote_config_repo + ":build_defs.bzl"), + config_repo_label(remote_config_repo, ":build_defs.bzl"), {}, ) repository_ctx.template( "tensorrt/include/tensorrt_config.h", - Label(remote_config_repo + ":tensorrt/include/tensorrt_config.h"), + config_repo_label(remote_config_repo, ":tensorrt/include/tensorrt_config.h"), {}, ) repository_ctx.template( "LICENSE", - Label(remote_config_repo + ":LICENSE"), + config_repo_label(remote_config_repo, ":LICENSE"), {}, ) return From 8e58b059efd7aed1b4bc1e2403f296f8c1d60f1d Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Wed, 19 Feb 2020 09:51:36 -0800 Subject: [PATCH 244/442] Update test to check for Float 32 check for RandomUniform legalize PiperOrigin-RevId: 295991291 Change-Id: Ieb1dc23915560bb33d7b036428d0d6bbd81c28ac --- tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir index 662e9fd642e..408975586d6 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir @@ -1400,9 +1400,10 @@ func @random_uniform_no_fold2(%arg0: tensor<2xi32>) -> tensor<*xf32> { // CHECK: %[[RANDOM:.*]] = "tf.RandomUniform" } -func @random_uniform_no_fold3(%arg0: tensor<2xi32>) -> tensor<*xf64> { - %1 = "tf.RandomUniform"(%arg0) { seed = 1, seed2 = 2} : (tensor<2xi32>) -> tensor<*xf64> - return %1 : tensor<*xf64> +func @random_uniform_no_fold3() -> tensor<2x5xf64> { + %0 = "tf.Const"() { value = dense<[2, 5]> : tensor<2xi32> } : () -> tensor<2xi32> + %1 = "tf.RandomUniform"(%0) { seed = 1, seed2 = 0} : (tensor<2xi32>) -> tensor<2x5xf64> + return %1 : tensor<2x5xf64> // CHECK-LABEL: random_uniform_no_fold3 // CHECK: %[[RANDOM:.*]] = "tf.RandomUniform" From eea407993125ebca71d5d237a29e0147165177a7 Mon Sep 17 00:00:00 2001 From: Ken Franko Date: Wed, 19 Feb 2020 10:05:28 -0800 Subject: [PATCH 245/442] Remove values property from DistributedValues. PiperOrigin-RevId: 295994651 Change-Id: Ic0d003c76e711bee12d5de563de902430e837d5e --- .../mirrored_function_strategy_test.py | 7 ++-- .../python/distribute/mirrored_strategy.py | 2 +- .../distribute/mirrored_strategy_test.py | 41 +++++++++++++++---- .../distribute/mirrored_variable_test.py | 6 ++- tensorflow/python/distribute/values.py | 17 +++++--- .../python/ops/stateful_random_ops_test.py | 6 +-- .../loss_scaling_gradient_tape_test.py | 2 +- 7 files changed, 56 insertions(+), 25 deletions(-) diff --git a/tensorflow/python/distribute/mirrored_function_strategy_test.py b/tensorflow/python/distribute/mirrored_function_strategy_test.py index 08e66b77933..aa40856f7a6 100644 --- a/tensorflow/python/distribute/mirrored_function_strategy_test.py +++ b/tensorflow/python/distribute/mirrored_function_strategy_test.py @@ -56,7 +56,6 @@ class MirroredFunctionStrategyTest(test.TestCase): self.assertLen(f_traces, 1) # Function traced once, not for each replica. # Returns a per-replica value. 
self.assertIsInstance(result1, values.PerReplica) - self.assertAllEqual([1, 2], result1.values) self.assertAllEqual([1, 2], self._strategy.experimental_local_results(result1)) @@ -64,7 +63,8 @@ class MirroredFunctionStrategyTest(test.TestCase): result2 = self._strategy.experimental_run_v2(f, args=(result1,)) self.assertLen(f_traces, 1) self.assertIsInstance(result2, values.PerReplica) - self.assertAllEqual([1, 3], result2.values) + self.assertAllEqual([1, 3], + self._strategy.experimental_local_results(result2)) def testMergeCall(self): f_traces = [] @@ -94,7 +94,8 @@ class MirroredFunctionStrategyTest(test.TestCase): self.assertLen(g_traces, 1) # Returns a per-replica value. self.assertIsInstance(result, values.PerReplica) - self.assertAllEqual([1, 1], result.values) + self.assertAllEqual([1, 1], + self._strategy.experimental_local_results(result)) if __name__ == "__main__": diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py index 630ae85ff97..e57c656139a 100644 --- a/tensorflow/python/distribute/mirrored_strategy.py +++ b/tensorflow/python/distribute/mirrored_strategy.py @@ -842,7 +842,7 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): def _local_results(self, val): if isinstance(val, values.DistributedValues): - return val.values + return val._values # pylint: disable=protected-access return (val,) def value_container(self, val): diff --git a/tensorflow/python/distribute/mirrored_strategy_test.py b/tensorflow/python/distribute/mirrored_strategy_test.py index f1f693d30dc..d60d489c516 100644 --- a/tensorflow/python/distribute/mirrored_strategy_test.py +++ b/tensorflow/python/distribute/mirrored_strategy_test.py @@ -356,7 +356,9 @@ class MirroredStrategyCallForEachReplicaTest(test.TestCase): with distribution.scope(): result = distribution.extended.call_for_each_replica(model_fn) - self.assertEqual((0, 1), self.evaluate(result.values)) + self.assertEqual( + (0, 1), + self.evaluate(distribution.experimental_local_results(result))) self.assertLen(traces, distribution.num_replicas_in_sync) def testFunctionInCallForEachReplicaInsideAnotherFunction(self, distribution): @@ -372,7 +374,9 @@ class MirroredStrategyCallForEachReplicaTest(test.TestCase): with distribution.scope(): result = step() - self.assertEqual((0, 1), self.evaluate(result.values)) + self.assertEqual( + (0, 1), + self.evaluate(distribution.experimental_local_results(result))) self.assertLen(traces, distribution.num_replicas_in_sync) def testNestedFunctionInCallForEachReplicaWithMergeCall(self, distribution): @@ -711,8 +715,14 @@ class MirroredVariableUpdateTest(test.TestCase): mirrored_var_result = self.evaluate( mirrored_var.assign_add(6.0, read_value=True)) self.assertEqual(7.0, mirrored_var_result) - self.assertEqual(7.0, self.evaluate(mirrored_var.values[0])) - self.assertEqual(7.0, self.evaluate(mirrored_var.values[1])) + self.assertEqual( + 7.0, + self.evaluate( + distribution.experimental_local_results(mirrored_var)[0])) + self.assertEqual( + 7.0, + self.evaluate( + distribution.experimental_local_results(mirrored_var)[1])) self.assertEqual( distribution.extended.worker_devices[0], mirrored_var._devices[0]) self.assertEqual( @@ -720,8 +730,14 @@ class MirroredVariableUpdateTest(test.TestCase): # read_value == False self.evaluate(mirrored_var.assign_add(2.0, read_value=False)) - self.assertEqual(9.0, self.evaluate(mirrored_var.values[0])) - self.assertEqual(9.0, self.evaluate(mirrored_var.values[1])) + self.assertEqual( + 9.0, + self.evaluate( + 
distribution.experimental_local_results(mirrored_var)[0])) + self.assertEqual( + 9.0, + self.evaluate( + distribution.experimental_local_results(mirrored_var)[1])) self.assertEqual( distribution.extended.worker_devices[0], mirrored_var._devices[0]) self.assertEqual( @@ -777,8 +793,14 @@ class MirroredVariableUpdateTest(test.TestCase): self.assertEqual(5.0, self.evaluate(mirrored_var)) mirrored_var_result = self.evaluate(mirrored_var.assign_sub(2.0)) self.assertEqual(3.0, mirrored_var_result) - self.assertEqual(3.0, self.evaluate(mirrored_var.values[0])) - self.assertEqual(3.0, self.evaluate(mirrored_var.values[1])) + self.assertEqual( + 3.0, + self.evaluate( + distribution.experimental_local_results(mirrored_var)[0])) + self.assertEqual( + 3.0, + self.evaluate( + distribution.experimental_local_results(mirrored_var)[1])) self.assertEqual( distribution.extended.worker_devices[0], mirrored_var._devices[0]) self.assertEqual( @@ -994,7 +1016,8 @@ class MirroredStrategyDefunTest(test.TestCase): distribution.extended.call_for_each_replica( defun.get_concrete_function, args=[mock_model] + inputs)) for i in range(len(devices)): - graph_function = per_replica_graph_functions.values[i] + graph_function = distribution.experimental_local_results( + per_replica_graph_functions)[i] # TODO(b/129555712): re-enable an assertion here that the two sets of # variables are the same. # self.assertEqual(set(graph_function.graph.variables), diff --git a/tensorflow/python/distribute/mirrored_variable_test.py b/tensorflow/python/distribute/mirrored_variable_test.py index f6ec7ccdc8d..0777bf3b42a 100644 --- a/tensorflow/python/distribute/mirrored_variable_test.py +++ b/tensorflow/python/distribute/mirrored_variable_test.py @@ -532,8 +532,10 @@ class MirroredVariableCreationTest(test.TestCase): expected_mean = 0.0 for i, _ in enumerate(distribution.extended.worker_devices): # Should see different values on different devices. - v_sum_value = self.evaluate(ret_v_sum.values[i].read_value()) - v_mean_value = self.evaluate(ret_v_mean.values[i].read_value()) + v_sum_value = self.evaluate( + distribution.experimental_local_results(ret_v_sum)[i].read_value()) + v_mean_value = self.evaluate( + distribution.experimental_local_results(ret_v_mean)[i].read_value()) expected = i + 3.0 self.assertEqual(expected, v_sum_value) expected_sum += expected diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py index 6210d51124b..baf3b8295dc 100644 --- a/tensorflow/python/distribute/values.py +++ b/tensorflow/python/distribute/values.py @@ -92,11 +92,6 @@ class DistributedValues(object): """Returns a representative component.""" return self._values[0] - # TODO(josh11b): Replace experimental_local_results with this? - @property - def values(self): - return self._values - @property def _devices(self): return tuple(v.device for v in self._values) @@ -139,6 +134,11 @@ class DistributedDelegate(DistributedValues): # __getattr__ and @property. See b/120402273. return getattr(self._get(), name) + @property + def values(self): + """Returns the per replica values.""" + return self._values + def _get_as_operand(self): """Returns the value for operations for the current device. 
@@ -272,6 +272,11 @@ class PerReplica(DistributedValues, composite_tensor.CompositeTensor): return PerReplicaSpec( *(type_spec.type_spec_from_value(v) for v in self._values)) + @property + def values(self): + """Returns the per replica values.""" + return self._values + class PerReplicaSpec(type_spec.TypeSpec): """Type specification for a `PerReplica`.""" @@ -824,7 +829,7 @@ class MirroredVariable(DistributedVariable, Mirrored): if update_replica_id is not None: # We are calling an assign function on the mirrored variable in an # update context. - return f(self.values[update_replica_id], *args, **kwargs) + return f(self._values[update_replica_id], *args, **kwargs) # We are calling assign on the mirrored variable in cross replica # context, use `strategy.extended.update()` to update the variable. diff --git a/tensorflow/python/ops/stateful_random_ops_test.py b/tensorflow/python/ops/stateful_random_ops_test.py index 3526ab4cb3b..45c75cf1958 100644 --- a/tensorflow/python/ops/stateful_random_ops_test.py +++ b/tensorflow/python/ops/stateful_random_ops_test.py @@ -715,9 +715,9 @@ class StatefulRandomOpsTest(test.TestCase, parameterized.TestCase): return t results = strat.extended.call_for_each_replica( fn=f, args=gens) - values = results.values - self.assertAllEqual(2, len(values)) - self.assertAllDifferent(values) + local_results = strat.experimental_local_results(results) + self.assertAllEqual(2, len(local_results)) + self.assertAllDifferent(local_results) if __name__ == "__main__": diff --git a/tensorflow/python/training/experimental/loss_scaling_gradient_tape_test.py b/tensorflow/python/training/experimental/loss_scaling_gradient_tape_test.py index c1394a17307..74a1836f343 100644 --- a/tensorflow/python/training/experimental/loss_scaling_gradient_tape_test.py +++ b/tensorflow/python/training/experimental/loss_scaling_gradient_tape_test.py @@ -75,7 +75,7 @@ class LossScaleGradientTapeTest(test.TestCase, parameterized.TestCase): def convert_tensor_to_list(tensor): if isinstance(tensor, values.DistributedValues): - return tensor.values + return strategy.experimental_local_results(tensor) else: return [tensor] return nest.map_structure(convert_tensor_to_list, results) From e0575253d11c6a57bc25ddfec09d4d2e1f2a47c1 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 10:11:55 -0800 Subject: [PATCH 246/442] Provide an accessor for dynamic dimension inference. PiperOrigin-RevId: 295996157 Change-Id: I50ea04cd692d1163b2e05d9f8e12dbeffc11fa3d --- tensorflow/compiler/xla/service/hlo_evaluator.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h index fc9d42c1b17..803004225d2 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator.h +++ b/tensorflow/compiler/xla/service/hlo_evaluator.h @@ -133,6 +133,10 @@ class HloEvaluator : public DfsHloVisitorWithDefault { dynamic_dimension_inference_ = dynamic_dimension_inference; } + DynamicDimensionInference* dynamic_dimension_inference() { + return dynamic_dimension_inference_; + } + // Enable the fast path for certain operations like dot or convolution. void set_use_fast_path(bool value) { use_fast_path_ = value; } From ca0bd89c9a00ae933617ecc98eedee105a29afb9 Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Wed, 19 Feb 2020 10:15:08 -0800 Subject: [PATCH 247/442] Move the _TFBufferWrapper helper to a common place instead of in the tpu codebase. 
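
For context, a rough usage sketch of the relocated helper (illustrative only,
not part of this change; the attr value and variable names below are made up):

    # Any module that needs a TF_Buffer tied to a Python object's lifetime can
    # now reuse the shared wrapper instead of re-implementing it locally.
    from tensorflow.core.framework import attr_value_pb2
    from tensorflow.python.framework import c_api_util

    serialized = attr_value_pb2.AttrValue(s=b"cluster_0").SerializeToString()
    scoped_buf = c_api_util.ScopedTFBuffer(serialized)
    # scoped_buf.buffer is the raw TF_Buffer*; it is freed when scoped_buf is
    # garbage collected, matching what TPUReplicateContext did before this move.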
PiperOrigin-RevId: 295996954 Change-Id: I2557fddd7cdc858ce661dd5a8c7bcd9996d519c6 --- tensorflow/python/framework/c_api_util.py | 10 ++++++++++ tensorflow/python/tpu/tpu.py | 18 ++++-------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tensorflow/python/framework/c_api_util.py b/tensorflow/python/framework/c_api_util.py index 101188293cd..ca493dd3623 100644 --- a/tensorflow/python/framework/c_api_util.py +++ b/tensorflow/python/framework/c_api_util.py @@ -97,6 +97,16 @@ class ScopedTFFunction(object): self.func = None +class ScopedTFBuffer(object): + """An internal class to help manage the TF_Buffer lifetime.""" + + def __init__(self, buf_string): + self.buffer = c_api.TF_NewBufferFromString(compat.as_bytes(buf_string)) + + def __del__(self): + c_api.TF_DeleteBuffer(self.buffer) + + class ApiDefMap(object): """Wrapper around Tf_ApiDefMap that handles querying and deletion. diff --git a/tensorflow/python/tpu/tpu.py b/tensorflow/python/tpu/tpu.py index 0bd79f85604..96789d2cea5 100644 --- a/tensorflow/python/tpu/tpu.py +++ b/tensorflow/python/tpu/tpu.py @@ -27,11 +27,11 @@ from six.moves import xrange # pylint: disable=redefined-builtin from tensorflow.core.framework import attr_value_pb2 from tensorflow.core.protobuf.tpu import dynamic_padding_pb2 as dynamic_padding -from tensorflow.python.client import pywrap_tf_session from tensorflow.python.compiler.xla import xla from tensorflow.python.distribute import device_util from tensorflow.python.distribute import distribution_strategy_context from tensorflow.python.framework import auto_control_deps +from tensorflow.python.framework import c_api_util from tensorflow.python.framework import config from tensorflow.python.framework import device as pydev from tensorflow.python.framework import dtypes @@ -251,16 +251,6 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext): outside the replicated computation. """ - class _TFBufferWrapper(object): - """An internal class to help manage the TF_Buffer lifetime.""" - - def __init__(self, buf_string): - self._buffer = pywrap_tf_session.TF_NewBufferFromString( - compat.as_bytes(buf_string)) - - def __del__(self): - pywrap_tf_session.TF_DeleteBuffer(self._buffer) - def __init__(self, name, num_replicas, pivot): """Builds a new TPUReplicateContext. @@ -285,7 +275,7 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext): self._host_compute_core = [] self._name = name self._name_as_bytes = compat.as_bytes(name) - self._tpu_relicate_attr_buf = self._TFBufferWrapper( + self._tpu_relicate_attr_buf = c_api_util.ScopedTFBuffer( attr_value_pb2.AttrValue(s=self._name_as_bytes).SerializeToString()) self._unsupported_ops = [] self._pivot = pivot @@ -534,8 +524,8 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext): "_cloned" not in op.node_def.attr): raise ValueError("TPU computations cannot be nested on op (%s)" % op) - op._set_attr_with_buf( - _TPU_REPLICATE_ATTR, self._tpu_relicate_attr_buf._buffer) + op._set_attr_with_buf(_TPU_REPLICATE_ATTR, + self._tpu_relicate_attr_buf.buffer) if self._outside_compilation_cluster: op._set_attr( _OUTSIDE_COMPILATION_ATTR, From 61b85f68db578ccf2318a8d394fea04dc74e58b1 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 10:21:02 -0800 Subject: [PATCH 248/442] avoid CHECK in op_level_cost_estimator, silently fallback. 
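
Illustrative sketch of the new fallback behavior (assumed usage, not code from
this change; the OpInfo proto setters below are written from memory):

    // A malformed, 3-element "strides" attr used to CHECK-fail inside the cost
    // estimator. After this change it DCHECK-fails only in debug builds and
    // falls back to the default {1, 1, 1, 1} strides in opt builds, so cost
    // estimation degrades gracefully instead of crashing.
    OpInfo op_info;
    auto* strides = (*op_info.mutable_attr())["strides"].mutable_list();
    strides->add_i(1);
    strides->add_i(2);
    strides->add_i(2);
    std::vector<int64> result = GetStrides(op_info);  // == {1, 1, 1, 1}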
PiperOrigin-RevId: 295998356
Change-Id: Iee5c94376db3cd9d0a69a351eca615b73ec68be9
---
 .../grappler/costs/op_level_cost_estimator.cc | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index ade9c7306d6..5bd2162b679 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -133,14 +133,15 @@ bool IsTraining(const OpInfo& op_info) {
   return false;
 }
 
-// TODO(dyoon): support non-4D tensors in the c ost functions of convolution
+// TODO(dyoon): support non-4D tensors in the cost functions of convolution
 // related ops (Conv, Pool, BatchNorm, and their backprops) and the related
 // helper functions.
 std::vector<int64> GetStrides(const OpInfo& op_info) {
   if (op_info.attr().find("strides") != op_info.attr().end()) {
     const auto strides = op_info.attr().at("strides").list().i();
-    CHECK(strides.size() == 4)
+    DCHECK(strides.size() == 4)
         << "Attr strides is not a length-4 vector: " << op_info.DebugString();
+    if (strides.size() != 4) return {1, 1, 1, 1};
     return {strides[0], strides[1], strides[2], strides[3]};
   }
   return {1, 1, 1, 1};
@@ -149,8 +150,9 @@ std::vector<int64> GetStrides(const OpInfo& op_info) {
 std::vector<int64> GetKernelSize(const OpInfo& op_info) {
   if (op_info.attr().find("ksize") != op_info.attr().end()) {
     const auto ksize = op_info.attr().at("ksize").list().i();
-    CHECK(ksize.size() == 4)
+    DCHECK(ksize.size() == 4)
         << "Attr ksize is not a length-4 vector: " << op_info.DebugString();
+    if (ksize.size() != 4) return {1, 1, 1, 1};
     return {ksize[0], ksize[1], ksize[2], ksize[3]};
   }
   // Note that FusedBatchNorm doesn't have ksize attr, but GetKernelSize returns
@@ -741,9 +743,12 @@ OpLevelCostEstimator::ConvolutionDimensionsFromInputs(
   // Only check equality when both sizes are known (in other words, when
   // neither is set to a minimum dimension size of 1).
if (iz != 1 && kz != 1) { - CHECK_EQ(iz % kz, 0) << "Input channel " << iz - << " is not a multiple of filter channel " << kz - << "."; + DCHECK_EQ(iz % kz, 0) << "Input channel " << iz + << " is not a multiple of filter channel " << kz + << "."; + if (iz % kz) { + *found_unknown_shapes = true; + } } else { iz = kz = std::max(iz, kz); } From fa5cdeae7e508f7aba20656d963b1c73bfbd444f Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Wed, 19 Feb 2020 10:40:38 -0800 Subject: [PATCH 249/442] Add a functiondef getter to the context PiperOrigin-RevId: 296002833 Change-Id: I238a2984a9320c084b7157e6eeb30b30aa132036 --- tensorflow/c/eager/c_api_experimental.cc | 19 ++++++++++++++ tensorflow/c/eager/c_api_experimental.h | 5 ++++ .../core/common_runtime/eager/context.cc | 4 +++ .../core/common_runtime/eager/context.h | 2 ++ tensorflow/python/eager/context.py | 25 +++++++++++++++++++ tensorflow/python/eager/context_test.py | 22 ++++++++++++++++ tensorflow/python/tfe_wrapper.cc | 8 ++++++ 7 files changed, 85 insertions(+) diff --git a/tensorflow/c/eager/c_api_experimental.cc b/tensorflow/c/eager/c_api_experimental.cc index 4f97d7b0517..46f1f98b036 100644 --- a/tensorflow/c/eager/c_api_experimental.cc +++ b/tensorflow/c/eager/c_api_experimental.cc @@ -569,3 +569,22 @@ void TFE_TensorHandleEnableImplicitMirroring(TFE_TensorHandle* h, h->handle->EnableImplicitMirroring(); status->status = tensorflow::Status::OK(); } + +void TFE_ContextGetFunctionDef(TFE_Context* ctx, const char* function_name, + TF_Buffer* buf, TF_Status* status) { + auto* function_def = ctx->context->FindFunctionDef(function_name); + if (function_def == nullptr) { + status->status = tensorflow::errors::NotFound( + "Unable to find FunctionDef with name: ", function_name); + return; + } + string str = function_def->SerializeAsString(); + void* data = tensorflow::port::Malloc(str.length()); + str.copy(static_cast(data), str.length(), 0); + buf->data = data; + buf->length = str.length(); + buf->data_deallocator = [](void* data, size_t length) { + tensorflow::port::Free(data); + }; + status->status = tensorflow::Status::OK(); +} diff --git a/tensorflow/c/eager/c_api_experimental.h b/tensorflow/c/eager/c_api_experimental.h index 075b5d02fdc..d2b632bc301 100644 --- a/tensorflow/c/eager/c_api_experimental.h +++ b/tensorflow/c/eager/c_api_experimental.h @@ -475,6 +475,11 @@ typedef struct TFE_CustomDevice { void TFE_RegisterCustomDevice(TFE_Context* ctx, TFE_CustomDevice device, const char* device_name, void* device_info); +TF_CAPI_EXPORT extern void TFE_ContextGetFunctionDef(TFE_Context* ctx, + const char* function_name, + TF_Buffer* buf, + TF_Status* status); + #ifdef __cplusplus } /* end extern "C" */ #endif diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc index 5932ed4b698..5e151461c0e 100644 --- a/tensorflow/core/common_runtime/eager/context.cc +++ b/tensorflow/core/common_runtime/eager/context.cc @@ -622,6 +622,10 @@ Status EagerContext::AddFunctionDef(const FunctionDef& fdef, return Status::OK(); } +const FunctionDef* EagerContext::GetFunctionDef(const string& function_name) { + return func_lib_def_.Find(function_name); +} + Status EagerContext::RemoveFunction(const string& func) { bool is_last_ref = false; { diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h index 094e7fd8b49..58a60f00393 100644 --- a/tensorflow/core/common_runtime/eager/context.h +++ b/tensorflow/core/common_runtime/eager/context.h @@ -232,6 +232,8 @@ 
class EagerContext : public core::RefCounted { const FunctionDefLibrary& library, const bool add_to_local_only = false); + const FunctionDef* GetFunctionDef(const string& function_name); + Status RemoveFunction(const string& func); // Clear remote executors on all worker targets in `remote_contexts_`. diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py index e32e71152f0..d87c157d1e6 100644 --- a/tensorflow/python/eager/context.py +++ b/tensorflow/python/eager/context.py @@ -28,6 +28,7 @@ from absl import logging import numpy as np import six +from tensorflow.core.framework import function_pb2 from tensorflow.core.protobuf import config_pb2 from tensorflow.core.protobuf import rewriter_config_pb2 from tensorflow.python import pywrap_tfe @@ -1054,6 +1055,26 @@ class Context(object): pywrap_tfe.TFE_ContextAddFunctionDef(self._handle, fdef_string, len(fdef_string)) + def get_function_def(self, name): + """Get a function definition from the context. + + Args: + name: function signature name. + + Returns: + The requested FunctionDef. + + Raises: + tf.errors.NotFoundError: if name is not the name of a registered function. + """ + with c_api_util.tf_buffer() as buffer_: + pywrap_tfe.TFE_ContextGetFunctionDef(self._handle, name, buffer_) + proto_data = pywrap_tf_session.TF_GetBuffer(buffer_) + function_def = function_pb2.FunctionDef() + function_def.ParseFromString(proto_data) + + return function_def + def remove_function(self, name): """Remove a function from the context. @@ -2124,6 +2145,10 @@ def remove_function(name): context().remove_function(name) +def get_function_def(name): + return context().get_function_def(name) + + # Not every user creates a Context via context.context() # (for example, enable_eager_execution in python/framework/ops.py), # but they do all import this file. Note that IS_IN_GRAPH_MODE and diff --git a/tensorflow/python/eager/context_test.py b/tensorflow/python/eager/context_test.py index 72c363a44dd..fd815fe7433 100644 --- a/tensorflow/python/eager/context_test.py +++ b/tensorflow/python/eager/context_test.py @@ -24,6 +24,7 @@ import numpy as np from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.framework import constant_op +from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.platform import test @@ -86,6 +87,27 @@ class ContextTest(test.TestCase): graph, = graphs self.assertIn('CPU:0', graph.node[0].device) + def testGetFunctionDef(self): + + @def_function.function + def f(): + return constant_op.constant(1.) 
+ + concrete = f.get_concrete_function() + function_def = context.get_function_def(concrete.name) + + self.assertIsNot(function_def, None) + + found_const_node = False + for node_def in function_def.node_def: + if node_def.op == 'Const': + found_const_node = True + break + self.assertTrue(found_const_node) + + with self.assertRaises(errors.NotFoundError): + _ = context.get_function_def('this_should_not_be_found') + if __name__ == '__main__': ops.enable_eager_execution() diff --git a/tensorflow/python/tfe_wrapper.cc b/tensorflow/python/tfe_wrapper.cc index 160b817d937..7be093a1340 100644 --- a/tensorflow/python/tfe_wrapper.cc +++ b/tensorflow/python/tfe_wrapper.cc @@ -382,6 +382,14 @@ PYBIND11_MODULE(_pywrap_tfe, m) { status.get()); tensorflow::MaybeRaiseRegisteredFromTFStatus(status.get()); }); + m.def("TFE_ContextGetFunctionDef", + [](py::handle& ctx, const char* function_name, TF_Buffer& buf) { + tensorflow::Safe_TF_StatusPtr status = + tensorflow::make_safe(TF_NewStatus()); + TFE_ContextGetFunctionDef(tensorflow::InputTFE_Context(ctx), + function_name, &buf, status.get()); + tensorflow::MaybeRaiseRegisteredFromTFStatus(status.get()); + }); m.def("TFE_ContextRemoveFunction", [](py::handle& ctx, const char* name) { tensorflow::Safe_TF_StatusPtr status = tensorflow::make_safe(TF_NewStatus()); From 0763de0044bee02c8fa00b33d5a837701a90bc54 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Wed, 19 Feb 2020 10:52:26 -0800 Subject: [PATCH 250/442] Add access to TEST_UNDECLARED_OUTPUTS_DIR which takes into account Bazel's use of `/` as a path separator on Windows. Note: This will have no impact until a later change to use `\` as path separator is checked in. PiperOrigin-RevId: 296005736 Change-Id: Id7ada3b06e38399fd17df76fb5c8d3b0ea70e0e2 --- tensorflow/core/platform/path.cc | 25 +++++++++++++++++++++++++ tensorflow/core/platform/path.h | 9 +++++++++ tensorflow/core/util/dump_graph.cc | 17 ++++++++--------- 3 files changed, 42 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/platform/path.cc b/tensorflow/core/platform/path.cc index 5c99b4eb68a..1e88328aace 100644 --- a/tensorflow/core/platform/path.cc +++ b/tensorflow/core/platform/path.cc @@ -40,6 +40,27 @@ namespace { const char kPathSep[] = "/"; +bool FixBazelEnvPath(const char* path, string* out) { + if (path == nullptr) return false; + if (out == nullptr) return true; + + *out = path; + +#ifdef PLATFORM_WINDOWS + // On Windows, paths generated by Bazel are always use `/` as the path + // separator. This prevents normal path management. In the event there are no + // `\` in the path, we convert all `/` to `\`. + if (out->find('\\') != string::npos) return path; + + for (size_t pos = out->find('/'); pos != string::npos; + pos = out->find('/', pos + 1)) { + (*out)[pos] = kPathSep[0]; + } +#endif + + return true; +} + } // namespace string JoinPathImpl(std::initializer_list paths) { @@ -308,5 +329,9 @@ string GetTempFilename(const string& extension) { #endif } +bool GetTestUndeclaredOutputsDir(string* dir) { + return internal::FixBazelEnvPath(getenv("TEST_UNDECLARED_OUTPUTS_DIR"), dir); +} + } // namespace io } // namespace tensorflow diff --git a/tensorflow/core/platform/path.h b/tensorflow/core/platform/path.h index db0348d8960..0aa080b0fc4 100644 --- a/tensorflow/core/platform/path.h +++ b/tensorflow/core/platform/path.h @@ -92,6 +92,15 @@ string CreateURI(tensorflow::StringPiece scheme, tensorflow::StringPiece host, // Creates a temporary file name with an extension. 
string GetTempFilename(const string& extension); +// Reads the TEST_UNDECLARED_OUTPUTS_DIR environment variable, and if set +// assigns `dir` to the value. `dir` is not modified if the environment variable +// is unset. Returns true if the environment variable is set, otherwise false. +// Passing `dir` as nullptr, will just probe for the environment variable. +// +// Note: This function obviates the need to deal with Bazel's odd path decisions +// on Windows, and should be preferred over a simple `getenv`. +bool GetTestUndeclaredOutputsDir(string* dir); + } // namespace io } // namespace tensorflow diff --git a/tensorflow/core/util/dump_graph.cc b/tensorflow/core/util/dump_graph.cc index 72b21fc2da3..b68aa058649 100644 --- a/tensorflow/core/util/dump_graph.cc +++ b/tensorflow/core/util/dump_graph.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/core/lib/strings/proto_serialization.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/path.h" namespace tensorflow { @@ -78,13 +79,14 @@ template string WriteTextProtoToUniqueFile(Env* env, const string& name, const char* proto_type, T& proto, const string& dirname) { - const char* dir = nullptr; + string dir; if (!dirname.empty()) { - dir = dirname.c_str(); + dir = dirname; } else { - dir = getenv("TF_DUMP_GRAPH_PREFIX"); + const char* prefix = getenv("TF_DUMP_GRAPH_PREFIX"); + if (prefix != nullptr) dir = prefix; } - if (!dir) { + if (dir.empty()) { LOG(WARNING) << "Failed to dump " << name << " because dump location is not " << " specified through either TF_DUMP_GRAPH_PREFIX environment " @@ -94,18 +96,15 @@ string WriteTextProtoToUniqueFile(Env* env, const string& name, if (absl::EqualsIgnoreCase(dir, "sponge") || absl::EqualsIgnoreCase(dir, "test_undeclared_outputs_dir")) { - const char* tmp_dir = getenv("TEST_UNDECLARED_OUTPUTS_DIR"); - if (tmp_dir == nullptr) { + if (!io::GetTestUndeclaredOutputsDir(&dir)) { LOG(WARNING) << "TF_DUMP_GRAPH_PREFIX=sponge, but " "TEST_UNDECLARED_OUTPUT_DIRS is not set, dumping to log"; dir = "-"; - } else { - dir = tmp_dir; } } string filepath = "NULL"; - if (std::strncmp(dir, "-", 2) == 0) { + if (dir == "-") { LOG(INFO) << proto.DebugString(); filepath = "LOG(INFO)"; } else { From 52d570caf609b02fa6e6780630b378ed16471702 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 10:55:40 -0800 Subject: [PATCH 251/442] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 296006530 Change-Id: I97f75bc86ae4e91f21e2b50c1bcba516d009f297 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index c744d5b466a..f69affe5e8a 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21345,7 +21345,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22053,7 +22053,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22249,7 +22249,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22318,7 +22318,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22433,7 +22433,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22492,7 +22492,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22666,7 +22666,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22857,7 +22857,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25297,7 +25297,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25629,7 +25629,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25679,7 +25679,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25929,7 +25929,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26559,7 +26559,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27624,7 +27624,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45491,7 +45491,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From d4d4eab6b7cbf8e2cf3c2e312feb4513a43d2689 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Wed, 19 Feb 2020 11:02:57 -0800 Subject: [PATCH 252/442] [XLA:Python] Add an alias CustomCallWithLayout for CustomCall. Change in preparation for redefining CustomCall to mean a CustomCall without layout, as in the xla_builder.h C++ API. PiperOrigin-RevId: 296008567 Change-Id: Id1eac792c5b300f67e04e7055826d8d366993c43 --- tensorflow/compiler/xla/python/xla.cc | 3 +++ tensorflow/compiler/xla/python/xla_client.py | 16 ++++++++++------ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index cf3441229f9..cd85edad13e 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -349,7 +349,10 @@ void BuildOpsSubmodule(py::module* m) { py::arg("precision_config") = nullptr); ops.def("ConvertElementType", &ConvertElementType, py::arg("operand"), py::arg("new_element_type")); + // TODO(phawkins): remove CustomCall after callers are updated to use + // CustomCallWithLayout. ops.def("CustomCall", &CustomCallWithLayout); + ops.def("CustomCallWithLayout", &CustomCallWithLayout); ops.def("Dot", &Dot, py::arg("lhs"), py::arg("rhs"), py::arg("precision_config") = nullptr); ops.def("DotGeneral", &DotGeneral, py::arg("lhs"), py::arg("rhs"), diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index 997343d2109..6574ccfe898 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -1189,12 +1189,12 @@ class ComputationBuilder(object): return ops.Call(self._builder, computation_to_apply.computation, list(operands)) - def CustomCall(self, - call_target_name, - operands, - shape_with_layout, - operand_shapes_with_layout, - opaque=None): + def CustomCallWithLayout(self, + call_target_name, + operands, + shape_with_layout, + operand_shapes_with_layout, + opaque=None): """Enqueues a custom call operation onto the computation. Args: @@ -1214,6 +1214,10 @@ class ComputationBuilder(object): list(operands), shape_with_layout, list(operand_shapes_with_layout), opaque) + # TODO(phawkins): remove CustomCall after callers are updated to use + # CustomCallWithLayout. + CustomCall = CustomCallWithLayout + def Map(self, operands, computation_to_apply, dimensions): """Enqueues a map operation onto the computation. From ee4a891f34d6f634a38eb889759f3ad49a17a22d Mon Sep 17 00:00:00 2001 From: Sean Silva Date: Wed, 19 Feb 2020 11:05:30 -0800 Subject: [PATCH 253/442] Set resource subtype correctly on bound inputs. This gives more information to shape propagation. To do this, we need to insert a cast back to the raw imported type, but shape propagation will eliminate that later. Incidentally, this also standardizes the arg types used to represent bound inputs between the V1 (SignatureDef) and V2 (ObjectGraph) importers. As such, I've fixed a TODO in the verifier to actually verify that the args have the right type. 
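
Roughly, for a mutable global tensor of type tensor<f32>, an exported
function's bound-input argument changes from

    func @f(%arg0: tensor<*x!tf.resource> {tf_saved_model.bound_input = @v})

to

    func @f(%arg0: tensor<!tf.resource<tensor<f32>>> {tf_saved_model.bound_input = @v})

and a "tf.Cast" back to tensor<*x!tf.resource> is inserted at the argument's
existing uses so the function body still type-checks; shape inference is
expected to fold that cast away later.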
PiperOrigin-RevId: 296009270 Change-Id: If5aee852ac08249c37ed8565f2e140e7b54c82d7 --- .../mlir/tensorflow/ir/tf_saved_model.cc | 41 ++++++++++--------- .../tensorflow/tests/tf_saved_model/basic.py | 2 +- .../tests/tf_saved_model/call_to_exported.py | 4 +- .../tf_saved_model_inline_global_tensors.mlir | 4 +- .../tensorflow/tests/tf_saved_model_ops.mlir | 2 +- .../tests/tf_saved_model_ops_invalid.mlir | 12 +++--- ...f_saved_model_optimize_global_tensors.mlir | 16 ++++---- .../mlir/tensorflow/translate/import_model.cc | 39 ++++++++++++++++++ 8 files changed, 80 insertions(+), 40 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc index 21b5354eeb8..8d3253ef81f 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc @@ -112,12 +112,26 @@ static LogicalResult VerifyIndexPath(Operation *op, NamedAttribute named_attr) { return mlir::success(); } -// Return true if `type` is a tensor of `!tf.resource`. This is the type that is -// used to represent mutable variables on exported functions' bound inputs. -static bool IsResourceVarType(Type type) { - TensorType tensor_type = type.dyn_cast(); - if (!tensor_type) return false; - return tensor_type.getElementType().isa(); +static LogicalResult VerifyBoundInputArgType(Operation *op_for_diagnostics, + Type arg_type, + GlobalTensorOp global_tensor) { + if (global_tensor.is_mutable()) { + auto expected_type = RankedTensorType::get( + {}, TF::ResourceType::get({global_tensor.type().cast()}, + arg_type.getContext())); + if (arg_type != expected_type) { + return op_for_diagnostics->emitError() + << "mutable bound input with type " << arg_type + << " expected to have type " << expected_type; + } + } else { + if (arg_type != global_tensor.type()) { + return op_for_diagnostics->emitError() + << "bound input for immutable 'tf_saved_model.global_tensor' must " + "match the global tensor's type"; + } + } + return success(); } LogicalResult TensorFlowSavedModelDialect::verifyRegionArgAttribute( @@ -137,20 +151,7 @@ LogicalResult TensorFlowSavedModelDialect::verifyRegionArgAttribute( << symbol_name << "'"; } auto arg_type = cast(op).getArgument(arg_index).getType(); - if (global_tensor.is_mutable()) { - if (!IsResourceVarType(arg_type)) { - return op->emitError() - << "bound inputs for mutable 'tf_saved_model.global_tensor's " - "must be tensors of '!tf.resource'"; - } - } else { - if (arg_type != global_tensor.type()) { - return op->emitError() << "bound input for immutable " - "'tf_saved_model.global_tensor' must " - "match the global tensor's type"; - } - } - return success(); + return VerifyBoundInputArgType(op, arg_type, global_tensor); } if (named_attr.first == "tf_saved_model.index_path") { return VerifyIndexPath(op, named_attr); diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py index 52ed0b4ed2b..4248099637c 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py @@ -46,7 +46,7 @@ class TestModule(tf.Module): # CHECK: "tf_saved_model.global_tensor"() {sym_name = "[[CONST:[a-zA-Z_0-9]+]]", tf_saved_model.exported_names = [], type = tensor, value = dense<4.300000e+01> : tensor} : () -> () # CHECK: func {{@[a-zA-Z_0-9]+}}( # CHECK-SAME: %arg0: tensor {tf_saved_model.index_path = [0]}, - # CHECK-SAME: %arg1: 
tensor<*x!tf.resource> {tf_saved_model.bound_input = @[[VAR]]}, + # CHECK-SAME: %arg1: tensor>> {tf_saved_model.bound_input = @[[VAR]]}, # CHECK-SAME: %arg2: tensor {tf_saved_model.bound_input = @[[CONST]]}) -> ( # CHECK-SAME: tensor {tf_saved_model.index_path = []}) # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["some_function"] diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/call_to_exported.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/call_to_exported.py index 8e9e197d62f..658cc37a22f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/call_to_exported.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/call_to_exported.py @@ -46,7 +46,7 @@ class TestModule(tf.Module): # # CHECK: func {{@[a-zA-Z_0-9]+}}( # CHECK-SAME: %arg0: tensor {tf_saved_model.index_path = [0]}, - # CHECK-SAME: %arg1: tensor<*x!tf.resource> {tf_saved_model.bound_input = {{@[a-zA-Z_0-9]+}}} + # CHECK-SAME: %arg1: tensor> {tf_saved_model.bound_input = {{@[a-zA-Z_0-9]+}}} # CHECK-SAME: ) -> ( # CHECK-SAME: tensor {tf_saved_model.index_path = [0]}, # CHECK-SAME: tensor {tf_saved_model.index_path = [1]}) @@ -55,7 +55,7 @@ class TestModule(tf.Module): # # CHECK: func {{@[a-zA-Z_0-9]+}}( # CHECK-SAME: %arg0: tensor {tf_saved_model.index_path = [0]}, - # CHECK-SAME: %arg1: tensor<*x!tf.resource> {tf_saved_model.bound_input = {{@[a-zA-Z_0-9]+}}} + # CHECK-SAME: %arg1: tensor> {tf_saved_model.bound_input = {{@[a-zA-Z_0-9]+}}} # CHECK-SAME: ) -> ( # CHECK-SAME: tensor {tf_saved_model.index_path = [0]}, # CHECK-SAME: tensor<*xf32> {tf_saved_model.index_path = [1]}) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_inline_global_tensors.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_inline_global_tensors.mlir index d1e1c9d6b09..365a5a3f402 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_inline_global_tensors.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_inline_global_tensors.mlir @@ -25,8 +25,8 @@ module attributes {tf_saved_model.semantics} { // CHECK: tf_saved_model.global_tensor "tf_saved_model.global_tensor"() { is_mutable, sym_name = "v", type = tensor, value = dense<1.0> : tensor } : () -> () - // CHECK: func @f(%arg0: tensor<*x!tf.resource> {tf_saved_model.bound_input = @v}) - func @f(%arg0: tensor<*x!tf.resource> {tf_saved_model.bound_input = @v}) + // CHECK: func @f(%arg0: tensor>> {tf_saved_model.bound_input = @v}) + func @f(%arg0: tensor>> {tf_saved_model.bound_input = @v}) attributes {tf_saved_model.exported_names = ["f"]} { // CHECK-NOT: tf.Const return diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops.mlir index cc809909f79..1bf172b2655 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops.mlir @@ -26,7 +26,7 @@ module attributes {tf_saved_model.semantics} { func @__concrete_function_run_computation( %arg0: tensor {tf_saved_model.index_path = [0, "foo"]}, %arg1: tensor<1x64xf32> {tf_saved_model.bound_input = @some_constant}, - %arg2: tensor<*x!tf.resource> {tf_saved_model.bound_input = @some_variable} + %arg2: tensor>> {tf_saved_model.bound_input = @some_variable} ) -> ( tensor {tf_saved_model.index_path = [0, "bar"]} ) attributes { tf_saved_model.exported_names = ["some_func"] } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir 
b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir index 0a5fe2708c1..6e6c8ae3821 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_ops_invalid.mlir @@ -219,8 +219,8 @@ module attributes {tf_saved_model.semantics} { "tf_saved_model.global_tensor"() { is_mutable, sym_name = "v", type = tensor, value = dense<42.0> : tensor } : () -> () // expected-error@+1 {{duplicate 'tf_saved_model.bound_input' binding}} func @f( - %arg0: tensor<*x!tf.resource> {tf_saved_model.bound_input = @v}, - %arg1: tensor<*x!tf.resource> {tf_saved_model.bound_input = @v} + %arg0: tensor>> {tf_saved_model.bound_input = @v}, + %arg1: tensor>> {tf_saved_model.bound_input = @v} ) attributes {tf_saved_model.exported_names = ["f"]} { return } @@ -232,9 +232,9 @@ module attributes {tf_saved_model.semantics} { "tf_saved_model.global_tensor"() { is_mutable, sym_name = "v", type = tensor, value = dense<1.> : tensor<1xf32> } : () -> () // expected-error@+1 {{can only apply 'tf_saved_model' argument attributes to exported functions}} - func @f(%arg0: tensor<*x!tf.resource> {tf_saved_model.bound_input = @v}) + func @f(%arg0: tensor>> {tf_saved_model.bound_input = @v}) -> (tensor {tf_saved_model.index_path = []}) { - %0 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>) -> tensor + %0 = "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor return %0 : tensor } } @@ -244,7 +244,7 @@ module attributes {tf_saved_model.semantics} { module attributes {tf_saved_model.semantics} { "tf_saved_model.global_tensor"() { is_mutable, sym_name = "v", type = tensor, value = dense<1.> : tensor<1xf32> } : () -> () - // expected-error@+1 {{bound inputs for mutable 'tf_saved_model.global_tensor's must be tensors of '!tf.resource'}} + // expected-error@+1 {{mutable bound input with type 'tensor' expected to have type 'tensor>>'}} func @f(%arg0: tensor {tf_saved_model.bound_input = @v}) attributes {tf_saved_model.exported_names = ["f"]} { return @@ -257,7 +257,7 @@ module attributes {tf_saved_model.semantics} { "tf_saved_model.global_tensor"() { sym_name = "v", type = tensor<1xf32>, value = dense<1.> : tensor<1xf32> } : () -> () // expected-error@+1 {{bound input for immutable 'tf_saved_model.global_tensor' must match the global tensor's type}} - func @f(%arg0: tensor<*x!tf.resource> {tf_saved_model.bound_input = @v}) + func @f(%arg0: tensor>> {tf_saved_model.bound_input = @v}) attributes {tf_saved_model.exported_names = ["f"]} { return } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors.mlir index 95b0bd54d70..f2a4373c777 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors.mlir @@ -14,10 +14,10 @@ module attributes {tf_saved_model.semantics} { "tf_saved_model.global_tensor"() { is_mutable, sym_name = "v", type = tensor, value = dense<42.> : tensor } : () -> () // CHECK: func @f(%arg0: tensor {tf_saved_model.bound_input = @v}) - func @f(%arg0: tensor<*x!tf.resource> {tf_saved_model.bound_input = @v}) -> (tensor {tf_saved_model.index_path = []}) + func @f(%arg0: tensor>> {tf_saved_model.bound_input = @v}) -> (tensor {tf_saved_model.index_path = []}) attributes {tf_saved_model.exported_names = ["f"]} { // CHECK-NOT: tf.ReadVariableOp - %val = "tf.ReadVariableOp"(%arg0) 
: (tensor<*x!tf.resource>) -> tensor + %val = "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor // CHECK: return %arg0 return %val : tensor } @@ -35,12 +35,12 @@ module attributes {tf_saved_model.semantics} { // CHECK-SAME: } : () -> () "tf_saved_model.global_tensor"() { is_mutable, sym_name = "v", type = tensor, value = dense<42.> : tensor } : () -> () - // CHECK: func @f(%arg0: tensor<*x!tf.resource> {tf_saved_model.bound_input = @v}) - func @f(%arg0: tensor<*x!tf.resource> {tf_saved_model.bound_input = @v}) + // CHECK: func @f(%arg0: tensor>> {tf_saved_model.bound_input = @v}) + func @f(%arg0: tensor>> {tf_saved_model.bound_input = @v}) attributes {tf_saved_model.exported_names = ["f"]} { %c0 = "tf.Const"() { value = dense<1.0> : tensor } : () -> tensor // CHECK: tf.AssignVariableOp - "tf.AssignVariableOp"(%arg0, %c0) : (tensor<*x!tf.resource>, tensor) -> () + "tf.AssignVariableOp"(%arg0, %c0) : (tensor>>, tensor) -> () return } @@ -57,10 +57,10 @@ module attributes {tf_saved_model.semantics} { // CHECK-SAME: } : () -> () "tf_saved_model.global_tensor"() { is_mutable, sym_name = "v", tf_saved_model.exported_names = ["v"], type = tensor, value = dense<42.> : tensor } : () -> () - // CHECK: func @f(%arg0: tensor<*x!tf.resource> {tf_saved_model.bound_input = @v}) - func @f(%arg0: tensor<*x!tf.resource> {tf_saved_model.bound_input = @v}) -> (tensor {tf_saved_model.index_path = []}) + // CHECK: func @f(%arg0: tensor>> {tf_saved_model.bound_input = @v}) + func @f(%arg0: tensor>> {tf_saved_model.bound_input = @v}) -> (tensor {tf_saved_model.index_path = []}) attributes {tf_saved_model.exported_names = ["f"]} { - %val = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>) -> tensor + %val = "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor return %val : tensor } diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index f6939abdf9f..39fe17800c9 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -51,6 +51,7 @@ limitations under the License. #include "mlir/IR/MLIRContext.h" // TF:llvm-project #include "mlir/IR/Module.h" // TF:llvm-project #include "mlir/IR/OpDefinition.h" // TF:llvm-project +#include "mlir/IR/StandardTypes.h" // TF:llvm-project #include "mlir/IR/Types.h" // TF:llvm-project #include "tensorflow/compiler/jit/shape_inference_helpers.h" #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" @@ -2515,6 +2516,43 @@ void StructuredValueLinearizer::RecursivelyFindLeaves( } } +// For exported functions with mutable bound inputs, rewrite the function +// signature to annotate resource subtypes on the types. +// +// The raw imported functions have `tensor<*x!tf.resource>` as the type for +// mutable bound inputs. Here we turn that into +// `tensor>>`. 
+void SetResourceSubtypes(mlir::ModuleOp module) {
+  mlir::SymbolTable symbol_table(module);
+  for (auto func : module.getOps<mlir::FuncOp>()) {
+    if (!mlir::tf_saved_model::IsExported(func)) continue;
+    mlir::OpBuilder builder(func.getBody());
+    llvm::SmallVector<mlir::Type, 4> new_input_types;
+    for (int i = 0, e = func.getNumArguments(); i < e; i++) {
+      auto arg = func.front().getArgument(i);
+      auto global_tensor =
+          mlir::tf_saved_model::LookupBoundInput(func, i, symbol_table);
+      if (global_tensor && global_tensor.is_mutable()) {
+        auto old_type = arg.getType();
+        auto new_type = mlir::RankedTensorType::get(
+            {}, mlir::TF::ResourceType::get(
+                    {global_tensor.type().cast<mlir::TensorType>()},
+                    module.getContext()));
+        arg.setType(new_type);
+        auto arg_with_original_type = builder.create<mlir::TF::CastOp>(
+            global_tensor.getLoc(), old_type, arg,
+            /*Truncate=*/builder.getBoolAttr(false));
+        arg.replaceAllUsesWith(arg_with_original_type);
+        // The RAUW replaces the arg with itself, so we need to set it back.
+        arg_with_original_type.setOperand(arg);
+      }
+      new_input_types.push_back(arg.getType());
+    }
+    func.setType(mlir::FunctionType::get(
+        new_input_types, func.getType().getResults(), module.getContext()));
+  }
+}
+
 // Reorder the ops in the module to make testing easier and less dependent
 // on implementation details such as the order of functions in the
 // FunctionDefLibrary.
@@ -2755,6 +2793,7 @@ Status CreateSavedModelIR(
           builder.getStrArrayAttr(object_names.GetExportedNames(node_id)));
     }
   }
+  SetResourceSubtypes(module);
   module.setAttr("tf_saved_model.semantics", builder.getUnitAttr());
   SortSavedModelModule(module);
   return Status::OK();

From 412e240c76b69b9915082a7ef68bd897a0345b30 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev
Date: Wed, 19 Feb 2020 11:09:11 -0800
Subject: [PATCH 254/442] [TF:MLIR] Add pass to move transposes to the end of
 the block

PiperOrigin-RevId: 296010153
Change-Id: I6b424455d590f97b2c822304dd9a689032f1d7f6
---
 ...t_optimization_move_transposes_begin.mlir} |   2 +-
 ...yout_optimization_move_transposes_end.mlir |  49 ++++++++
 .../transforms/layout_optimization.cc         | 119 +++++++++++++++++-
 3 files changed, 166 insertions(+), 4 deletions(-)
 rename tensorflow/compiler/mlir/tensorflow/tests/{layout_optimization_move_transposes.mlir => layout_optimization_move_transposes_begin.mlir} (96%)
 create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir

diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_begin.mlir
similarity index 96%
rename from tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes.mlir
rename to tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_begin.mlir
index 19b85393d78..adb9059256c 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_begin.mlir
@@ -1,4 +1,4 @@
-// RUN: tf-opt %s -tf-move-transposes -verify-diagnostics | FileCheck %s --dump-input=always
+// RUN: tf-opt %s -tf-move-transposes=direction=begin -verify-diagnostics | FileCheck %s --dump-input=always
 
 // CHECK-LABEL: func @move_across_single_op
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir
new file mode 100644 index 00000000000..7c54bdb3889 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir @@ -0,0 +1,49 @@ +// RUN: tf-opt %s -tf-move-transposes=direction=end -verify-diagnostics | FileCheck %s --dump-input=always + +// CHECK-LABEL: func @move_across_single_op +func @move_across_single_op(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { + + // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK: %[[TANH:[0-9]*]] = "tf.Tanh"(%arg0) {{.*}} tensor<1x4x4x8xf32> + // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[TANH]], %[[RES_PERM]]) {{.*}} tensor<1x8x4x4xf32> + // CHECK: return %[[RES_TRANSPOSE]] + + %0 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> + %1 = "tf.Transpose"(%arg0, %0) : (tensor<1x4x4x8xf32>, tensor<4xi64>) -> tensor<1x8x4x4xf32> + %2 = "tf.Tanh"(%1) : (tensor<1x8x4x4xf32>) -> tensor<1x8x4x4xf32> + + return %2 : tensor<1x8x4x4xf32> +} + +// CHECK-LABEL: func @move_across_multiple_ops +func @move_across_multiple_ops(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { + + // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK: %[[TANH0:[0-9]*]] = "tf.Tanh"(%arg0) {{.*}} tensor<1x4x4x8xf32> + // CHECK: %[[TANH1:[0-9]*]] = "tf.Tanh"(%[[TANH0]]) {{.*}} tensor<1x4x4x8xf32> + // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[TANH1]], %[[RES_PERM]]) + // CHECK: return %[[RES_TRANSPOSE]] + + %0 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> + %1 = "tf.Transpose"(%arg0, %0) : (tensor<1x4x4x8xf32>, tensor<4xi64>) -> tensor<1x8x4x4xf32> + %2 = "tf.Tanh"(%1) : (tensor<1x8x4x4xf32>) -> tensor<1x8x4x4xf32> + %3 = "tf.Tanh"(%2) : (tensor<1x8x4x4xf32>) -> tensor<1x8x4x4xf32> + + return %3 : tensor<1x8x4x4xf32> +} + +// CHECK-LABEL: func @move_across_multi_operand_op +func @move_across_multi_operand_op(%arg0: tensor<1x4x4x8xf32>, %arg1: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { + + // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK: %[[ADD:[0-9]*]] = "tf.AddV2"(%arg0, %arg1) {{.*}} tensor<1x4x4x8xf32> + // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[ADD]], %[[RES_PERM]]) + // CHECK: return %[[RES_TRANSPOSE]] + + %0 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> + %1 = "tf.Transpose"(%arg0, %0) : (tensor<1x4x4x8xf32>, tensor<4xi64>) -> tensor<1x8x4x4xf32> + %2 = "tf.Transpose"(%arg1, %0) : (tensor<1x4x4x8xf32>, tensor<4xi64>) -> tensor<1x8x4x4xf32> + %3 = "tf.AddV2"(%1, %2) : (tensor<1x8x4x4xf32>, tensor<1x8x4x4xf32>) -> tensor<1x8x4x4xf32> + + return %3 : tensor<1x8x4x4xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc b/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc index 4e74ed9f0e0..ba46059e5b6 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc @@ -49,7 +49,21 @@ class LayoutAssignmentPass : public FunctionPass { // delete redundant transposes. 
class MoveTransposesPass : public FunctionPass { public: + enum class Direction { kBegin, kEnd }; + + MoveTransposesPass() = default; + MoveTransposesPass(const MoveTransposesPass& pass) {} + void runOnFunction() final; + + private: + Option direction_{ + *this, "direction", + llvm::cl::desc("Move transposes to the beginning or the end of the block " + "where they are defined."), + llvm::cl::values( + clEnumValN(Direction::kBegin, "begin", "beginning of the block"), + clEnumValN(Direction::kEnd, "end", "end of the block"))}; }; using Permutation = SmallVector; @@ -228,20 +242,119 @@ void MoveTransposeBefore(Operation* op, SmallVector* work_list) { } } +// Move Transpose operations that permute `op` operands after the `op`. +void MoveTransposeAfter(Operation* op, SmallVector* work_list) { + // TODO(ezhulenev): Move transpose across layout sensitive operations. + if (!op->hasTrait()) return; + + // Transpose operations that are operands of the `op`. + SmallVector transpose_ops; + + // Constant operation that defines permutation indices for operand transposes. + ConstOp permutation_op; + + // All operation operands must be transpose operations with the same + // permutation indices. + for (OpOperand& operand : op->getOpOperands()) { + // Operand must be defined by a transpose op. + TransposeOp transpose = + dyn_cast_or_null(operand.get().getDefiningOp()); + if (!transpose) return; + + // With permutation defined by constant operation. + ConstOp perm = + dyn_cast_or_null(transpose.getOperand(1).getDefiningOp()); + if (!perm) return; + + // With the same permutation indices. + auto dense_elem_attr = perm.value().dyn_cast(); + if (!dense_elem_attr) return; + + if (!permutation_op) permutation_op = perm; + + // Check that permutation matches for all result transposes. + if (perm.value() != permutation_op.value()) return; + + // Add a transpose operation for later reuse only if it's used once. + if (transpose.getResult().hasOneUse()) transpose_ops.push_back(transpose); + } + + // Nothing to do here. + if (!permutation_op) return; + + // At this point we checked that we can safely move Transpose node after + // `op`, bypass all operands transposes, and transpose op results. + Location loc = op->getLoc(); + + // Move constant op defining result permutation to the beginning of the block. + permutation_op.getOperation()->moveBefore(&op->getBlock()->front()); + + // Bypass Transpose nodes for all operands. + for (OpOperand& operand : op->getOpOperands()) { + TransposeOp transpose = + dyn_cast(operand.get().getDefiningOp()); + operand.set(transpose.getOperand(0)); + } + + // Maybe add Transpose nodes for all results (or reuse existing transposes). + OpBuilder builder(op); + builder.setInsertionPoint(op); + + for (OpResult result : op->getResults()) { + result.setType(op->getOperand(0).getType()); + + // Try to push transpose further down. + for (Operation* user : result.getUsers()) work_list->push_back(user); + + // Try to reuse operand transposes. + TransposeOp transpose; + if (!transpose_ops.empty()) { + transpose = transpose_ops.pop_back_val(); + transpose.getOperation()->moveBefore(op->getNextNode()); + transpose.setOperand(0, result); + transpose.setOperand(1, permutation_op); + } else { + transpose = builder.create(loc, result, permutation_op); + } + + // Forward all users to the transpose operation. + result.replaceAllUsesWith(transpose); + transpose.setOperand(0, result); + } + + // Remove unused transpose operations. 
+ while (!transpose_ops.empty()) { + TransposeOp transpose = transpose_ops.pop_back_val(); + transpose.erase(); + } +} + void MoveTransposesPass::runOnFunction() { FuncOp func = getFunction(); SmallVector work_list; func.walk([&](TransposeOp transpose) { - for (auto operand : transpose.getOperands()) { - if (auto op = operand.getDefiningOp()) work_list.push_back(op); + if (direction_ == Direction::kBegin) { + // Try to push transpose before the operand operation. + for (auto operand : transpose.getOperands()) { + if (auto op = operand.getDefiningOp()) work_list.push_back(op); + } + } else { + // Try to push transpose after the user operation. + for (Operation* user : transpose.y().getUsers()) { + work_list.push_back(user); + } } }); while (!work_list.empty()) { Operation* op = work_list.pop_back_val(); - MoveTransposeBefore(op, &work_list); + if (direction_ == Direction::kBegin) { + MoveTransposeBefore(op, &work_list); + } else if (direction_ == Direction::kEnd) { + MoveTransposeAfter(op, &work_list); + } } } From 6eda9f6142072d70d57e066fb643f59c5e45fb09 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Wed, 19 Feb 2020 11:22:30 -0800 Subject: [PATCH 255/442] Add check for "correct" add before inputs reordering. We can not reorder inputs in broadcast add. PiperOrigin-RevId: 296013376 Change-Id: Ie3ee2cc1569fd6df7bd94c63d5e919c1f4c98c5d --- .../delegates/gpu/cl/inference_context.cc | 35 +++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/inference_context.cc b/tensorflow/lite/delegates/gpu/cl/inference_context.cc index 93e284c77ca..a2a66cae0c9 100644 --- a/tensorflow/lite/delegates/gpu/cl/inference_context.cc +++ b/tensorflow/lite/delegates/gpu/cl/inference_context.cc @@ -174,6 +174,38 @@ bool IsBufferBased(const TensorStorageType& type) { type == TensorStorageType::IMAGE_BUFFER; } +// Generic add is add that have several runtime inputs and they are not +// broadcasted, i.e. pointwise add for N tensors where N > 1. +bool IsGenericAdd(const Node& node, + const std::vector>*>& inputs, + const std::vector>*>& outputs) { + if (inputs.size() == 1) { + return false; + } + const OperationType op_type = OperationTypeFromString(node.operation.type); + if (op_type != OperationType::ADD) { + return false; + } + + const auto dst_shape = outputs[0]->tensor.shape; + for (int i = 0; i < inputs.size(); ++i) { + const auto src_shape = inputs[i]->tensor.shape; + if (dst_shape.b != src_shape.b && src_shape.b == 1) { + return false; + } + if (dst_shape.h != src_shape.h && src_shape.h == 1) { + return false; + } + if (dst_shape.w != src_shape.w && src_shape.w == 1) { + return false; + } + if (dst_shape.c != src_shape.c && src_shape.c == 1) { + return false; + } + } + return true; +} + } // namespace CLNode::CLNode(CLNode&& node) @@ -304,8 +336,7 @@ Status InferenceContext::ConvertOperations( // ADD can be linked. // In current approach "linking" tensor can be only latest written // tensor(during linear order of execution) among input tensors. 
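// A small illustration of the new check (hypothetical shapes, not taken from a
// real model): with a destination shape of BHWC(1, 8, 8, 32) and two runtime
// inputs both shaped BHWC(1, 8, 8, 32), the add is "generic" and its inputs may
// be reordered below so that the latest-written tensor comes first. If the
// second input is instead BHWC(1, 1, 1, 32), it is broadcast over H and W,
// IsGenericAdd() returns false, and the original input order is preserved.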
- const OperationType op_type = OperationTypeFromString(node.operation.type); - if (inputs.size() > 1 && op_type == OperationType::ADD) { + if (IsGenericAdd(node, inputs, outputs)) { int latest_written_tensor_index = 0; int last_usage = tensor_usages[inputs[0]->id]; for (int j = 1; j < inputs.size(); ++j) { From 3ee7c31a7b28dad0df1a1c487061adeb3134b1fe Mon Sep 17 00:00:00 2001 From: Vishnuvardhan Janapati <46058173+jvishnuvardhan@users.noreply.github.com> Date: Wed, 19 Feb 2020 11:40:16 -0800 Subject: [PATCH 256/442] Update Session.py The current example provided in TF website throws an error in TF2.x as we need to disable eager to build a graph and run in session. Please check the colab [gist here](https://colab.sandbox.google.com/gist/jvishnuvardhan/01e01bd71653f6566c68f5210fabdf65/untitled827.ipynb). Thanks! --- tensorflow/python/client/session.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py index 65ecc205369..f69618245f3 100644 --- a/tensorflow/python/client/session.py +++ b/tensorflow/python/client/session.py @@ -1514,6 +1514,7 @@ class Session(BaseSession): example: ```python + tf.compat.v1.disable_eager_execution() # need to disable eager in TF2.x # Build a graph. a = tf.constant(5.0) b = tf.constant(6.0) @@ -1523,7 +1524,7 @@ class Session(BaseSession): sess = tf.compat.v1.Session() # Evaluate the tensor `c`. - print(sess.run(c)) + print(sess.run(c)) # prints 30.0 ``` A session may own resources, such as From 2b59e666d8fb8ba63b657b34d2e031ea2cd36597 Mon Sep 17 00:00:00 2001 From: Jiho Choi Date: Wed, 19 Feb 2020 11:54:03 -0800 Subject: [PATCH 257/442] Use a TraceMe argument. PiperOrigin-RevId: 296021551 Change-Id: Ifc86e30e5f7cb3972192137b7a92b647a63a2aac --- tensorflow/core/common_runtime/executor.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 3a43a193b9e..8d650c21210 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -2160,9 +2160,8 @@ void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node, TaggedNodeSeq* ready) { profiler::TraceMe activity( [&]() { - return strings::StrCat( - "ExecutorPropagateOutputs:", item->kernel->name_view(), - "#id=", step_id_, "#"); + return strings::StrCat("ExecutorPropagateOutputs#", "id=", step_id_, + ",kernel_name=", item->kernel->name_view(), "#"); }, profiler::GetTFTraceMeLevel(/*is_expensive=*/false)); From 9c90a3e834780c67ba2d6e9b1ec82922a85a84e0 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 12:05:50 -0800 Subject: [PATCH 258/442] fix windows build from cl/295978584. 
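A minimal sketch of the pattern the loop is rewritten to (the reason the
previous form broke the Windows build is an assumption here, likely missing
support for C++17 structured bindings in that toolchain):

  for (const auto& it : per_device_launch_info[i]) {
    uint64 group_id = it.first;
    const GroupLaunchInfo& group_info = it.second;
    // ... group_id / group_info are then used exactly as before ...
  }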
PiperOrigin-RevId: 296024920 Change-Id: Ib00b33130ff67d901487a27227d5c2599a8c3d7b --- tensorflow/core/profiler/utils/derived_timeline.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/profiler/utils/derived_timeline.cc b/tensorflow/core/profiler/utils/derived_timeline.cc index ef9f308965b..b94d756020f 100644 --- a/tensorflow/core/profiler/utils/derived_timeline.cc +++ b/tensorflow/core/profiler/utils/derived_timeline.cc @@ -302,7 +302,9 @@ void DeriveEventsFromHostTrace(const XPlane* host_trace, device_plane.GetOrCreateLine(kThreadIdKernelLaunch); launch_line.SetName(kDerivedLineKernelLaunch); launch_line.SetTimestampNs(std::min(device_plane_start, host_plane_start)); - for (const auto& [group_id, group_info] : per_device_launch_info[i]) { + for (const auto& it : per_device_launch_info[i]) { + uint64 group_id = it.first; + const GroupLaunchInfo& group_info = it.second; if (auto group_name = gtl::FindOrNull(event_group_name_map, group_id)) { XEventBuilder device_event = launch_line.AddEvent(*device_plane.GetOrCreateEventMetadata( From b8c227e22afd135cd256763af2e65513364db850 Mon Sep 17 00:00:00 2001 From: Allen Lavoie Date: Wed, 19 Feb 2020 12:51:16 -0800 Subject: [PATCH 259/442] Correctly record when an EagerOperation runs on a custom device Should fix a performance regression where we looked up a CustomDevice each EagerExecute since we're now caching the lookup in EagerOperation (assuming the device name doesn't change and TFE_OpReset is used, like we do when executing from Python). Also fixes a memory issue with custom device registration. I still need to make TensorHandle::op_device_ a variant, but I think that can be split out into a separate change. PiperOrigin-RevId: 296034610 Change-Id: I029b3b6927cd4efcf43beef8bd14ec50020eb089 --- tensorflow/c/eager/c_api.cc | 10 ++-- tensorflow/c/eager/custom_device_test.cc | 40 ++++++++++++++- tensorflow/core/common_runtime/eager/BUILD | 1 + .../core/common_runtime/eager/context.cc | 2 +- .../common_runtime/eager/eager_operation.cc | 14 ++++-- .../common_runtime/eager/eager_operation.h | 17 ++++++- .../core/common_runtime/eager/execute.cc | 49 ++++++++++++------- .../common_runtime/eager/tensor_handle.cc | 17 ++++--- .../core/common_runtime/eager/tensor_handle.h | 8 +++ .../eager/remote_copy_node.cc | 9 ++-- 10 files changed, 126 insertions(+), 41 deletions(-) diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index fe31c317853..1beca1eacb7 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -1277,10 +1277,12 @@ void TFE_OpSetDevice(TFE_Op* op, const char* device_name, TF_Status* status) { } const char* TFE_OpGetDevice(TFE_Op* op, TF_Status* status) { - tensorflow::Device* device = (op->operation.Device() == nullptr) - ? op->operation.EagerContext().HostCPU() - : op->operation.Device(); - return device->name().c_str(); + absl::variant variant_device = + (op->operation.Device() == tensorflow::kVariantDeviceNull) + ? 
op->operation.EagerContext().HostCPU() + : op->operation.Device(); + return absl::visit([](auto* device) { return device->name().c_str(); }, + variant_device); } void TFE_OpSetXLACompilation(TFE_Op* op, unsigned char enable) { diff --git a/tensorflow/c/eager/custom_device_test.cc b/tensorflow/c/eager/custom_device_test.cc index be2cdd3bd1c..3a6f9d93164 100644 --- a/tensorflow/c/eager/custom_device_test.cc +++ b/tensorflow/c/eager/custom_device_test.cc @@ -31,6 +31,8 @@ struct LoggingDevice { tensorflow::string underlying_device; // Set to true whenever a TensorHandle is copied onto the device bool* arrived_flag; + // Set to true whenever an operation is executed + bool* executed_flag; }; struct LoggedTensor { @@ -115,6 +117,7 @@ void LoggingDeviceExecute(int num_inputs, TFE_TensorHandle** inputs, outputs[i] = MakeLoggedTensorHandle(dev->ctx, dev->device_name, std::move(logged_tensor), s); } + *(dev->executed_flag) = true; } void DeleteLoggingDevice(void* device_info) { @@ -122,7 +125,7 @@ void DeleteLoggingDevice(void* device_info) { } void RegisterLoggingDevice(TFE_Context* context, const char* name, - bool* arrived_flag) { + bool* arrived_flag, bool* executed_flag) { TFE_CustomDevice custom_device; custom_device.copy_tensor_to_device = &CopyToLoggingDevice; custom_device.copy_tensor_from_device = &CopyTensorFromLoggingDevice; @@ -131,6 +134,7 @@ void RegisterLoggingDevice(TFE_Context* context, const char* name, LoggingDevice* device = new LoggingDevice; device->ctx = context; device->arrived_flag = arrived_flag; + device->executed_flag = executed_flag; device->device_name = name; device->underlying_device = "/job:localhost/replica:0/task:0/device:CPU:0"; TFE_RegisterCustomDevice(context, custom_device, name, device); @@ -144,13 +148,15 @@ TEST(CUSTOM_DEVICE, RegisterSimpleDevice) { TFE_DeleteContextOptions(opts); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); bool arrived = false; + bool executed = false; const char* name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; - RegisterLoggingDevice(context, name, &arrived); + RegisterLoggingDevice(context, name, &arrived, &executed); TFE_TensorHandle* hcpu = TestMatrixTensorHandle(); ASSERT_FALSE(arrived); TFE_TensorHandle* hdevice = TFE_TensorHandleCopyToDevice(hcpu, context, name, status.get()); ASSERT_TRUE(arrived); + ASSERT_FALSE(executed); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); std::unique_ptr matmul( MatMulOp(context, hcpu, hdevice), TFE_DeleteOp); @@ -160,6 +166,7 @@ TEST(CUSTOM_DEVICE, RegisterSimpleDevice) { int num_retvals = 1; TFE_Execute(matmul.get(), &retval, &num_retvals, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + ASSERT_TRUE(executed); TFE_DeleteTensorHandle(retval); TFE_DeleteTensorHandle(hcpu); @@ -167,4 +174,33 @@ TEST(CUSTOM_DEVICE, RegisterSimpleDevice) { TFE_DeleteContext(context); } +TEST(CUSTOM_DEVICE, ResetOperation) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + std::unique_ptr context( + TFE_NewContext(opts, status.get()), TFE_DeleteContext); + TFE_DeleteContextOptions(opts); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + bool arrived = false; + bool executed = false; + const char* custom_device_name = + "/job:localhost/replica:0/task:0/device:CUSTOM:0"; + RegisterLoggingDevice(context.get(), custom_device_name, &arrived, &executed); + + std::unique_ptr reused_op( + TFE_NewOp(context.get(), 
"Identity", status.get()), TFE_DeleteOp); + TFE_OpReset(reused_op.get(), "Identity", custom_device_name, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + ASSERT_EQ(tensorflow::string(TFE_OpGetDevice(reused_op.get(), status.get())), + tensorflow::string(custom_device_name)); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + TFE_OpReset(reused_op.get(), "Identity", + "/job:localhost/replica:0/task:0/device:CPU:0", status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + ASSERT_EQ(tensorflow::string(TFE_OpGetDevice(reused_op.get(), status.get())), + tensorflow::string("/job:localhost/replica:0/task:0/device:CPU:0")); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); +} + } // namespace diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD index bd34e70d73e..c5bde68da02 100644 --- a/tensorflow/core/common_runtime/eager/BUILD +++ b/tensorflow/core/common_runtime/eager/BUILD @@ -111,6 +111,7 @@ tf_cuda_library( "//tensorflow/core/platform:errors", "//tensorflow/core/platform:platform_port", "@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:variant", ], ) diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc index 5e151461c0e..f4e998a1c1e 100644 --- a/tensorflow/core/common_runtime/eager/context.cc +++ b/tensorflow/core/common_runtime/eager/context.cc @@ -748,7 +748,7 @@ Status EagerContext::FindCustomDeviceFromName(const string& device_name, void EagerContext::RegisterCustomDevice(const string& device_name, std::unique_ptr device) { - custom_devices_[device_name] = std::move(device); + custom_devices_.emplace(device_name, std::move(device)); } bool EagerContext::OnSameTask(const Device* first, const Device* second) const { diff --git a/tensorflow/core/common_runtime/eager/eager_operation.cc b/tensorflow/core/common_runtime/eager/eager_operation.cc index e84d3b0e9bf..c85079277c4 100644 --- a/tensorflow/core/common_runtime/eager/eager_operation.cc +++ b/tensorflow/core/common_runtime/eager/eager_operation.cc @@ -41,7 +41,6 @@ Status EagerOperation::Reset( "registered in the binary running in this process."); } attrs_.Reset(op); - device_ = nullptr; use_xla_ = false; is_function_ = is_function; cancellation_manager_ = nullptr; @@ -133,11 +132,20 @@ Status EagerOperation::SetDeviceName(const char* device, const bool reset) { DeviceNameUtils::HasSomeDetails(device_parsed_name_) ? DeviceNameUtils::ParsedNameToString(device_parsed_name_) : ""; + CustomDevice* custom_device; + if (ctx_.FindCustomDeviceFromName(device_name_, &custom_device).ok()) { + device_ = custom_device; + } else { + // Device placement for physical devices happens lazily in + // EagerExecute/EagerRemoteExecute, and can depend on the inputs. + device_ = kVariantDeviceNull; + } } } else if (reset) { raw_device_name_.clear(); device_name_.clear(); device_parsed_name_.Clear(); + device_ = kVariantDeviceNull; } return Status::OK(); } @@ -160,8 +168,8 @@ string EagerOperation::DebugString() const { strings::StrAppend(&out, "Name: ", Name(), "\n"); strings::StrAppend(&out, "Device Name: [", device_name_, "]\n"); - strings::StrAppend( - &out, "Device: ", Device() ? 
Device()->DebugString() : "[]", "\n"); + strings::StrAppend(&out, "Device: ", VariantDeviceDebugString(Device()), + "\n"); for (const auto& input : inputs_) { VLOG(1) << "Input ptr: " << input; strings::StrAppend(&out, "Input: ", input->DebugString(), "\n"); diff --git a/tensorflow/core/common_runtime/eager/eager_operation.h b/tensorflow/core/common_runtime/eager/eager_operation.h index c653a92058a..cfde6f0e09d 100644 --- a/tensorflow/core/common_runtime/eager/eager_operation.h +++ b/tensorflow/core/common_runtime/eager/eager_operation.h @@ -16,6 +16,7 @@ limitations under the License. #define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_OPERATION_H_ #include "absl/types/optional.h" +#include "absl/types/variant.h" #include "tensorflow/core/common_runtime/eager/attr_builder.h" #include "tensorflow/core/common_runtime/eager/context.h" #include "tensorflow/core/common_runtime/eager/eager_executor.h" @@ -69,7 +70,12 @@ class EagerOperation { const string& Name() const { return attrs_.op_name(); } const AttrTypeMap* AttrTypes() const { return attr_types_; } - tensorflow::Device* Device() const { return device_; } + // Like TensorHandles, EagerOperations may be placed either on a virtual + // CustomDevice or on a physical Device. + absl::variant Device() const { + return device_; + } + void SetDevice(tensorflow::Device* device) { device_ = device; raw_device_name_.clear(); @@ -77,6 +83,13 @@ class EagerOperation { device_parsed_name_ = device->parsed_name(); } + void SetDevice(tensorflow::CustomDevice* device) { + device_ = device; + raw_device_name_.clear(); + device_name_ = device->name(); + DeviceNameUtils::ParseFullName(device_name_, &device_parsed_name_); + } + const string& GetDeviceName() const { return device_name_; } const DeviceNameUtils::ParsedName& GetDeviceParsedName() const { return device_parsed_name_; @@ -127,7 +140,7 @@ class EagerOperation { AttrBuilder attrs_; const AttrTypeMap* attr_types_; gtl::InlinedVector inputs_; - tensorflow::Device* device_; + absl::variant device_; string raw_device_name_; string device_name_; DeviceNameUtils::ParsedName device_parsed_name_; diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc index 348f7774d58..0d57a1dfe0e 100644 --- a/tensorflow/core/common_runtime/eager/execute.cc +++ b/tensorflow/core/common_runtime/eager/execute.cc @@ -80,6 +80,15 @@ const string& DeviceNameOrUnspecified(Device* device) { return (device == nullptr) ? *unspecified_string : device->name(); } +const string& DeviceNameOrUnspecified( + absl::variant device) { + if (VariantDeviceIsCustom(device)) { + return absl::get(device)->name(); + } else { + return DeviceNameOrUnspecified(absl::get(device)); + } +} + // This function expects *handle to point to an existing tensor handle that is // currently on "handle_device", but where the operation expects that input to // reside on "expected_input_device". 
The function will arrange for this @@ -363,7 +372,7 @@ Status EagerLocalExecute(EagerOperation* op, TensorHandle** retvals, EagerContext& ctx = op->EagerContext(); auto& executor = op->Executor(); TF_RETURN_IF_ERROR(executor.status()); - Device* device = op->Device(); + Device* device = absl::get(op->Device()); Fprint128 cache_key = op->MutableAttrs()->CacheKey(op->GetDeviceName()); @@ -609,7 +618,7 @@ void PrepareRemoteOp(eager::Operation* remote_op, EagerOperation* op) { remote_op->set_name(op->Name()); op->Attrs().FillAttrValueMapWithoutDefaults(remote_op->mutable_attrs()); - remote_op->set_device(op->Device()->name()); + remote_op->set_device(absl::get(op->Device())->name()); remote_op->set_is_function(op->is_function()); } @@ -640,7 +649,7 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals, EagerContext& ctx = op->EagerContext(); // TODO(fishx): Remove following code when lazy tensor copy is ready. - if (op->Device() == nullptr) { + if (op->Device() == kVariantDeviceNull) { tensorflow::Device* device = nullptr; string device_name = op->GetDeviceName(); TF_RETURN_IF_ERROR(ctx.FindDeviceFromName(device_name.c_str(), &device)); @@ -654,7 +663,7 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals, if (!DeviceNameUtils::GetTaskName(op->GetDeviceParsedName(), &remote_task)) { return errors::InvalidArgument( "Unable to find remote task corresponding to device ", - op->Device()->name()); + VariantDeviceName(op->Device())); } std::unique_ptr request(new eager::EnqueueRequest); @@ -662,6 +671,7 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals, eager::Operation* remote_op = request->add_queue()->mutable_operation(); + tensorflow::Device* op_device = absl::get(op->Device()); { profiler::TraceMe activity("CopyInputToExpectedDevice", profiler::TraceMeLevel::kInfo); @@ -674,16 +684,16 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals, absl::get(input->DeviceOrHostCPU(ctx)); const string* input_device_name = &input_device_or_cpu->name(); bool serialize_resource_dtype_and_shape = false; - if (op->Device() != input_device && + if (op_device != input_device && // If the expected and actual devices are on the same task, don't // explicitly copy, and instead depend on the copy to happen locally // when the op is executed on the device. - !ctx.OnSameTask(op->Device(), input_device)) { + !ctx.OnSameTask(op_device, input_device)) { if (eagerly_copy_function_remote_inputs || input_device_or_cpu->IsLocal()) { tensorflow::Device* remote_cpu_device; TF_RETURN_IF_ERROR( - ctx.CPUDeviceOnTask(op->Device(), &remote_cpu_device)); + ctx.CPUDeviceOnTask(op_device, &remote_cpu_device)); // TODO(b/110044833): It's possible the same tensor gets copied to the // remote device repeatedly. // Always copy to the remote CPU so that the actual device can be @@ -695,7 +705,7 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals, // If the input is already on the right device, then nothing to do. 
if (remote_cpu_device != handle_device) { TF_RETURN_IF_ERROR(CopyInputToExpectedDevice( - &ctx, op, op->Device(), handle, i, handle_device, + &ctx, op, op_device, handle, i, handle_device, remote_cpu_device, &handle)); op->UpdateInput(i, handle); input = handle; @@ -707,7 +717,7 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals, } else { serialize_resource_dtype_and_shape = (input->dtype == DT_RESOURCE) && - (!input->HasResourceShapeMirror(op->Device())); + (!input->HasResourceShapeMirror(op_device)); } } auto* input_handle = remote_op->add_inputs(); @@ -720,7 +730,7 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals, input_handle->op_id(), input_handle->output_num(), remote_task, context_id, &ctx); TF_RETURN_IF_ERROR(input->AddResourceShapeMirror( - std::move(tensor_handle_data), op->Device())); + std::move(tensor_handle_data), op_device)); } } } @@ -737,7 +747,6 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals, } *num_retvals = num_outputs; - tensorflow::Device* op_device = op->Device(); const tensorflow::uint64 id = remote_op->id(); for (int i = 0; i < num_outputs; ++i) { // TODO(nareshmodi): Change the callback to instead add the decref to a @@ -841,7 +850,9 @@ Status MaybeUpdateOpDevice(EagerOperation* op) { EagerContext& ctx = op->EagerContext(); bool all_inputs_eligible_for_cpu_pinning = ctx.PinSmallOpsToCPU() && !op->is_function() && IsPinnableOp(op->Name()); - Device* op_device = op->Device() == nullptr ? ctx.HostCPU() : op->Device(); + Device* op_device = op->Device() == kVariantDeviceNull + ? ctx.HostCPU() + : absl::get(op->Device()); for (int i = 0; i < op->Inputs().size(); ++i) { TensorHandle* tensor_handle = op->Inputs()[i]; if (tensor_handle->dtype == DT_RESOURCE) { @@ -855,7 +866,7 @@ Status MaybeUpdateOpDevice(EagerOperation* op) { // be selected based on device priority. If any input to an op // is a resource we must pin it to prevent different device selection. // TODO(iga): null device can mean "unspecified" or "CPU". Clean this up. - if (resource_device != op_device || op->Device() == nullptr) { + if (resource_device != op_device || op->Device() == kVariantDeviceNull) { DVLOG(1) << (resource_device != op_device ? "Changing " : "Setting ") << "device of operation " << op->Name() << " to " << resource_device->name() << " because input #" << i @@ -920,14 +931,14 @@ Status EagerExecute(EagerOperation* op, TensorHandle** retvals, profiler::TraceMe activity( [&] { return absl::StrCat("EagerExecute: ", op->Name()); }, profiler::TraceMeLevel::kInfo); - TF_RETURN_IF_ERROR(MaybeUpdateOpDevice(op)); - CustomDevice* custom_device; - if (op->EagerContext() - .FindCustomDeviceFromName(op->GetDeviceName(), &custom_device) - .ok()) { - return custom_device->Execute(op, retvals, num_retvals); + + if (VariantDeviceIsCustom(op->Device())) { + return absl::get(op->Device()) + ->Execute(op, retvals, num_retvals); } + TF_RETURN_IF_ERROR(MaybeUpdateOpDevice(op)); + if (!op->Executor().Async()) { // In sync mode, always clear error to maintain the same behavior as before. // TODO(b/141004939): Remove this. 
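// A minimal sketch (illustrative only, mirroring the dispatch introduced in
// EagerExecute above) of how callers are expected to consume the variant
// device after this change:
//
//   absl::variant<tensorflow::Device*, tensorflow::CustomDevice*> d = op->Device();
//   if (VariantDeviceIsCustom(d)) {
//     return absl::get<CustomDevice*>(d)->Execute(op, retvals, num_retvals);
//   }
//   Device* device = absl::get<Device*>(d);  // nullptr while still unspecified
//                                            // (i.e. equal to kVariantDeviceNull)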
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc index 0a1eec32869..9e49cd1fb87 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.cc +++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc @@ -788,12 +788,17 @@ bool VariantDeviceIsCustom( return variant_device.index() != 0; } -string VariantDeviceDebugString( - absl::variant variant_device) { - if (VariantDeviceIsCustom(variant_device)) { - return absl::get(variant_device)->name(); +string VariantDeviceName(absl::variant device) { + return absl::visit([](auto* device) { return device->name(); }, device); +} + +string VariantDeviceDebugString(absl::variant device) { + if (device == kVariantDeviceNull) { + return "[]"; + } else if (VariantDeviceIsCustom(device)) { + return absl::get(device)->name(); } else { - return absl::get(variant_device)->DebugString(); + return absl::get(device)->DebugString(); } } @@ -816,7 +821,7 @@ string TensorHandle::DebugString() const { string device_debug = VariantDeviceDebugString(device_); strings::StrAppend(&out, "Device: ", device_debug); bool is_cpu = - !VariantDeviceIsCustom(device_) && absl::get(device_) != nullptr; + !VariantDeviceIsCustom(device_) && device_ != kVariantDeviceNull; // Consider supporting non-CPU tensors and CPU tensors with a device_ set to // non-NULL if needed. strings::StrAppend(&out, ", Tensor: ", diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.h b/tensorflow/core/common_runtime/eager/tensor_handle.h index dd6171d1ee0..2024111ef35 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.h +++ b/tensorflow/core/common_runtime/eager/tensor_handle.h @@ -323,9 +323,17 @@ class TensorHandle : public core::RefCounted { // Checks whether a VariantDevice contains a custom device. bool VariantDeviceIsCustom(absl::variant device); +// Wraps device->name() or CustomDevice->name(). +string VariantDeviceName(absl::variant device); + // Wraps device->DebugString() or CustomDevice->name(). string VariantDeviceDebugString(absl::variant device); +// Indicates either HostCPU or an unset physical device. We never set a null +// CustomDevice*. +const absl::variant kVariantDeviceNull = + static_cast(nullptr); + // Returns the device backing the resource. Else, returns nullptr. 
Device* GetResourceDevice(const ResourceHandle& handle, EagerContext* ctx); diff --git a/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc b/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc index edf7a0ad08b..b020ed8944e 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc +++ b/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc @@ -35,13 +35,13 @@ void PrepareRemoteOp(eager::Operation* remote_op, EagerOperation* op) { remote_op->set_name(op->Name()); op->Attrs().FillAttrValueMap(remote_op->mutable_attrs()); - remote_op->set_device(op->Device()->name()); + remote_op->set_device(VariantDeviceName(op->Device())); } Status CreateUncachedKernelAndDeviceOp( EagerOperation* op, core::RefCountPtr* kernel) { EagerContext& ctx = op->EagerContext(); - Device* device = op->Device(); + Device* device = absl::get(op->Device()); FunctionLibraryRuntime* flr = ctx.func_lib(device); if (flr == nullptr) { @@ -102,8 +102,9 @@ Status RemoteCopyNode::RunLocalSend(EagerOperation* op) { TF_RETURN_IF_ERROR(CreateUncachedKernelAndDeviceOp(op, &kernel)); gtl::InlinedVector input_vector(1); - TF_RETURN_IF_ERROR( - src_->TensorValue(&input_vector[0], ctx_->CanonicalDevice(op->Device()))); + TF_RETURN_IF_ERROR(src_->TensorValue( + &input_vector[0], + ctx_->CanonicalDevice(absl::get(op->Device())))); EagerKernelArgs args(std::move(input_vector)); return kernel->Run(args, /*outputs=*/nullptr, From 8d4a54d4acbc91a1a38d6d91f9a64bef26d74437 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Wed, 19 Feb 2020 12:56:30 -0800 Subject: [PATCH 260/442] Make dump_graph safer to use on Windows. Avoids using `\` in filenames, and uses JoinPath for constructing paths. PiperOrigin-RevId: 296035763 Change-Id: I9e537af31b50b7eab3d120e0aa6a0a9de1381384 --- tensorflow/core/util/dump_graph.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/util/dump_graph.cc b/tensorflow/core/util/dump_graph.cc index b68aa058649..50c149d48a6 100644 --- a/tensorflow/core/util/dump_graph.cc +++ b/tensorflow/core/util/dump_graph.cc @@ -40,7 +40,8 @@ string MakeUniqueFilename(string name) { // Remove illegal characters from `name`. for (int i = 0; i < name.size(); ++i) { char ch = name[i]; - if (ch == '/' || ch == '[' || ch == ']' || ch == '*' || ch == '?') { + if (ch == '/' || ch == '[' || ch == ']' || ch == '*' || ch == '?' || + ch == '\\') { name[i] = '_'; } } @@ -114,7 +115,7 @@ string WriteTextProtoToUniqueFile(Env* env, const string& name, << proto_type << ": " << status; return "(unavailable)"; } - filepath = absl::StrCat(dir, "/", MakeUniqueFilename(name)); + filepath = io::JoinPath(dir, MakeUniqueFilename(name)); status = WriteToFile(filepath, proto); if (!status.ok()) { LOG(WARNING) << "Failed to dump " << proto_type From 52162196f6aa64756d4935de08797b4c6a996bc4 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 13:06:40 -0800 Subject: [PATCH 261/442] make op_level_cost_estimator more robust with "random" input. 
PiperOrigin-RevId: 296038412 Change-Id: I2a41674ffe8824bbdd5331af60264db7d10f198f --- tensorflow/core/grappler/costs/op_level_cost_estimator.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc index 5bd2162b679..aec9938afa5 100644 --- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc +++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc @@ -773,6 +773,11 @@ int64 OpLevelCostEstimator::CountConv2DOperations( DCHECK(op_info.op() == kConv2d || op_info.op() == kDepthwiseConv2dNative) << "Invalid Operation: not Conv2D nor DepthwiseConv2dNative"; + if (op_info.inputs_size() < 2) { // Unexpect inputs. + *found_unknown_shapes = true; + return 0; + } + ConvolutionDimensions conv_dims = ConvolutionDimensionsFromInputs( op_info.inputs(0).shape(), op_info.inputs(1).shape(), op_info, found_unknown_shapes); From de9dfedd42729a0e6db1d1f4cba0b844f9b531f8 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Wed, 19 Feb 2020 13:09:02 -0800 Subject: [PATCH 262/442] Use GetTestUndeclaredOutputsDir to access TEST_UNDECLARED_OUTPUTS_DIR. On Windows, Bazel populates environment variables with `/`s only. Changing path manipulation logic to use `\` properly on Windows will conflict with this behavior, requiring a layer of indirection to deal with Bazel. PiperOrigin-RevId: 296038917 Change-Id: I9e6ae7492853966881db9c5fa53ced2383bce4aa --- tensorflow/compiler/xla/service/dump.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/xla/service/dump.cc b/tensorflow/compiler/xla/service/dump.cc index 05186f26ef6..3cb0eb78c5b 100644 --- a/tensorflow/compiler/xla/service/dump.cc +++ b/tensorflow/compiler/xla/service/dump.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/strings/proto_serialization.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/path.h" #include "tensorflow/core/platform/regexp.h" namespace xla { @@ -110,10 +111,7 @@ struct CanonicalDebugOptions { string dump_to_lower = absl::AsciiStrToLower(opts.xla_dump_to()); if (dump_to_lower == "sponge" || dump_to_lower == "test_undeclared_outputs_dir") { - const char* dir = getenv("TEST_UNDECLARED_OUTPUTS_DIR"); - if (dir != nullptr) { - dump_to = dir; - } else { + if (!tensorflow::io::GetTestUndeclaredOutputsDir(&dump_to)) { LOG(ERROR) << "--xla_dump_to=" << opts.xla_dump_to() << ", but environment variable TEST_UNDECLARED_OUTPUTS_DIR " "is not set, so cannot dump anywhere."; From b6687af2ffcb9ccce9ee8f852ca389e3ff94a448 Mon Sep 17 00:00:00 2001 From: "T.J. 
Alumbaugh" Date: Wed, 19 Feb 2020 13:19:00 -0800 Subject: [PATCH 263/442] Armv8 asm fix: vector MOV PiperOrigin-RevId: 296040987 Change-Id: I36b0e62ec2a95a6fa66644ce5bdc61f07fee168d --- .../lite/kernels/internal/optimized/neon_tensor_utils.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc index 8e0c77a8d5c..b6549a2ecf1 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc @@ -496,8 +496,8 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate( "fmul v17.4s, v17.4s, v4.4s\n" "b 2f\n" "1:\n" - "mov v16.4s, v4.4s\n" - "mov v17.4s, v4.4s\n" + "mov v16.16b, v4.16b\n" + "mov v17.16b, v4.16b\n" "2:\n" "ld1 {v12.16b}, [%[mat_ptr0]], #16\n" "ld1 {v8.16b}, [%[vec_ptr]], #16\n" From f3a0e01a4069126adf5d53e4c6c5442645c94aa6 Mon Sep 17 00:00:00 2001 From: Jakob Buchgraber Date: Wed, 19 Feb 2020 13:20:45 -0800 Subject: [PATCH 264/442] cuda_configure: fix quoting issue if paths contain spaces PiperOrigin-RevId: 296041411 Change-Id: If3a679f013f6b44efd9739ac5b8eab169b52ab2a --- third_party/gpus/cuda_configure.bzl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index caf7cccfb9f..c28cbbac2ea 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -472,11 +472,14 @@ def _check_cuda_libs(repository_ctx, script_path, libs): cmd += "f.write('%s' + linesep);" % line cmd += "f.close();" cmd += "from os import system;" - args = " ".join([path + " " + str(check) for path, check in libs]) + args = " ".join(["\"" + path + "\" " + str(check) for path, check in libs]) cmd += "system('%s script.py %s');" % (python_bin, args) all_paths = [path for path, _ in libs] checked_paths = execute(repository_ctx, [python_bin, "-c", cmd]).stdout.splitlines() + + # Filter out empty lines from splitting on '\r\n' on Windows + checked_paths = [path for path in checked_paths if len(path) > 0] if all_paths != checked_paths: auto_configure_fail("Error with installed CUDA libs. Expected '%s'. Actual '%s'." 
% (all_paths, checked_paths)) From d7eae8706f8dd85af57b763fd986e9a8cbc5f66a Mon Sep 17 00:00:00 2001 From: Advait Jain Date: Wed, 19 Feb 2020 13:24:33 -0800 Subject: [PATCH 265/442] Use TF_LITE_REPORT_ERROR macro instead of error_reporter_->Report PiperOrigin-RevId: 296042338 Change-Id: Ia4d99b29322fb9c5465854447770ed38697dbd2a --- .../micro_speech/recognize_commands.cc | 9 +++-- .../micro_speech/recognize_commands.h | 9 +++-- tensorflow/lite/micro/micro_allocator.cc | 36 ++++++++++++------- tensorflow/lite/micro/micro_interpreter.cc | 28 +++++++++------ 4 files changed, 52 insertions(+), 30 deletions(-) diff --git a/tensorflow/lite/micro/examples/micro_speech/recognize_commands.cc b/tensorflow/lite/micro/examples/micro_speech/recognize_commands.cc index 5fd1454b49f..96f35984051 100644 --- a/tensorflow/lite/micro/examples/micro_speech/recognize_commands.cc +++ b/tensorflow/lite/micro/examples/micro_speech/recognize_commands.cc @@ -38,7 +38,8 @@ TfLiteStatus RecognizeCommands::ProcessLatestResults( if ((latest_results->dims->size != 2) || (latest_results->dims->data[0] != 1) || (latest_results->dims->data[1] != kCategoryCount)) { - error_reporter_->Report( + TF_LITE_REPORT_ERROR( + error_reporter_, "The results for recognition should contain %d elements, but there are " "%d in an %d-dimensional shape", kCategoryCount, latest_results->dims->data[1], @@ -47,7 +48,8 @@ TfLiteStatus RecognizeCommands::ProcessLatestResults( } if (latest_results->type != kTfLiteUInt8) { - error_reporter_->Report( + TF_LITE_REPORT_ERROR( + error_reporter_, "The results for recognition should be uint8 elements, but are %d", latest_results->type); return kTfLiteError; @@ -55,7 +57,8 @@ TfLiteStatus RecognizeCommands::ProcessLatestResults( if ((!previous_results_.empty()) && (current_time_ms < previous_results_.front().time_)) { - error_reporter_->Report( + TF_LITE_REPORT_ERROR( + error_reporter_, "Results must be fed in increasing time order, but received a " "timestamp of %d that was earlier than the previous one of %d", current_time_ms, previous_results_.front().time_); diff --git a/tensorflow/lite/micro/examples/micro_speech/recognize_commands.h b/tensorflow/lite/micro/examples/micro_speech/recognize_commands.h index 57a09194b35..059d567fb20 100644 --- a/tensorflow/lite/micro/examples/micro_speech/recognize_commands.h +++ b/tensorflow/lite/micro/examples/micro_speech/recognize_commands.h @@ -59,7 +59,8 @@ class PreviousResultsQueue { void push_back(const Result& entry) { if (size() >= kMaxResults) { - error_reporter_->Report( + TF_LITE_REPORT_ERROR( + error_reporter_, "Couldn't push_back latest result, too many already!"); return; } @@ -69,7 +70,8 @@ class PreviousResultsQueue { Result pop_front() { if (size() <= 0) { - error_reporter_->Report("Couldn't pop_front result, none present!"); + TF_LITE_REPORT_ERROR(error_reporter_, + "Couldn't pop_front result, none present!"); return Result(); } Result result = front(); @@ -86,7 +88,8 @@ class PreviousResultsQueue { // queue. 
Result& from_front(int offset) { if ((offset < 0) || (offset >= size_)) { - error_reporter_->Report("Attempt to read beyond the end of the queue!"); + TF_LITE_REPORT_ERROR(error_reporter_, + "Attempt to read beyond the end of the queue!"); offset = size_ - 1; } int index = front_index_ + offset; diff --git a/tensorflow/lite/micro/micro_allocator.cc b/tensorflow/lite/micro/micro_allocator.cc index c693b9023ce..428d15e0f0a 100644 --- a/tensorflow/lite/micro/micro_allocator.cc +++ b/tensorflow/lite/micro/micro_allocator.cc @@ -325,7 +325,8 @@ TfLiteStatus InitializeRuntimeTensor( TfLiteStatus MicroAllocator::Init() { auto* subgraphs = model_->subgraphs(); if (subgraphs->size() != 1) { - error_reporter_->Report("Only 1 subgraph is currently supported.\n"); + TF_LITE_REPORT_ERROR(error_reporter_, + "Only 1 subgraph is currently supported.\n"); return kTfLiteError; } subgraph_ = (*subgraphs)[0]; @@ -338,7 +339,8 @@ TfLiteStatus MicroAllocator::Init() { sizeof(TfLiteTensor) * context_->tensors_size, alignof(TfLiteTensor))); if (context_->tensors == nullptr) { - error_reporter_->Report( + TF_LITE_REPORT_ERROR( + error_reporter_, "Failed to allocate memory for context->tensors, %d bytes required", sizeof(TfLiteTensor) * context_->tensors_size); return kTfLiteError; @@ -350,7 +352,8 @@ TfLiteStatus MicroAllocator::Init() { memory_allocator_, *tensors_->Get(i), model_->buffers(), error_reporter_, &context_->tensors[i]); if (status == kTfLiteError) { - error_reporter_->Report("Failed to initialize tensor %d", i); + TF_LITE_REPORT_ERROR(error_reporter_, "Failed to initialize tensor %d", + i); return kTfLiteError; } } @@ -375,7 +378,8 @@ MicroAllocator::MicroAllocator(TfLiteContext* context, const Model* model, // failures in the constructor is to have a static function that returns a // pointer to the class. If allocation failed, a nullptr will be returned. 
if (status != kTfLiteOk) { - error_reporter_->Report("MicroAllocator: Failed to initialize."); + TF_LITE_REPORT_ERROR(error_reporter_, + "MicroAllocator: Failed to initialize."); active_ = false; } else { active_ = true; @@ -394,7 +398,8 @@ TfLiteStatus MicroAllocator::AllocateNodeAndRegistrations( sizeof(NodeAndRegistration) * operators_->size(), alignof(NodeAndRegistration))); if (output == nullptr) { - error_reporter_->Report( + TF_LITE_REPORT_ERROR( + error_reporter_, "Failed to allocate memory for node_and_registrations."); return kTfLiteError; } @@ -405,28 +410,31 @@ TfLiteStatus MicroAllocator::AllocateNodeAndRegistrations( const auto* op = operators_->Get(i); size_t index = op->opcode_index(); if (index >= opcodes->size()) { - error_reporter_->Report("Missing registration for opcode_index %d\n", - index); + TF_LITE_REPORT_ERROR(error_reporter_, + "Missing registration for opcode_index %d\n", index); return kTfLiteError; } auto* opcode = (*opcodes)[index]; status = GetRegistrationFromOpCode(opcode, op_resolver, error_reporter_, &(output[i].registration)); if (status != kTfLiteOk) { - error_reporter_->Report("Failed to get registration from op code % d\n ", - opcode); + TF_LITE_REPORT_ERROR(error_reporter_, + "Failed to get registration from op code % d\n ", + opcode); return status; } const auto* registration = output[i].registration; if (registration == nullptr) { - error_reporter_->Report("Skipping op for opcode_index %d\n", index); + TF_LITE_REPORT_ERROR(error_reporter_, "Skipping op for opcode_index %d\n", + index); return kTfLiteError; } BuiltinOperator op_type = static_cast(registration->builtin_code); if (op_type != BuiltinOperator_CUSTOM && op->custom_options()) { - error_reporter_->Report( + TF_LITE_REPORT_ERROR( + error_reporter_, "Unsupported behavior: found builtin operator %s with custom " "options.\n", EnumNameBuiltinOperator(op_type)); @@ -502,7 +510,8 @@ TfLiteStatus MicroAllocator::FinishTensorAllocation() { arena_size - memory_allocator_->GetDataSize(); // Make sure we have enough arena size. if (planner.GetMaximumMemorySize() > actual_available_arena_size) { - error_reporter_->Report( + TF_LITE_REPORT_ERROR( + error_reporter_, "Arena size is too small for activation buffers. Needed %d but only " "%d was available.", planner.GetMaximumMemorySize(), remaining_arena_size); @@ -517,7 +526,8 @@ TfLiteStatus MicroAllocator::FinishTensorAllocation() { // them from the tail (persistent area). if (AllocateVariables(tensors_, context_->tensors, memory_allocator_) != kTfLiteOk) { - error_reporter_->Report( + TF_LITE_REPORT_ERROR( + error_reporter_, "Failed to allocate variables. 
Please increase arena size."); return kTfLiteError; } diff --git a/tensorflow/lite/micro/micro_interpreter.cc b/tensorflow/lite/micro/micro_interpreter.cc index 2326c2d2163..45254e04d7e 100644 --- a/tensorflow/lite/micro/micro_interpreter.cc +++ b/tensorflow/lite/micro/micro_interpreter.cc @@ -167,7 +167,8 @@ TfLiteStatus MicroInterpreter::AllocateTensors() { if (registration->prepare) { TfLiteStatus prepare_status = registration->prepare(&context_, node); if (prepare_status != kTfLiteOk) { - error_reporter_->Report( + TF_LITE_REPORT_ERROR( + error_reporter_, "Node %s (number %d) failed to prepare with status %d", OpNameFromRegistration(registration), i, prepare_status); return kTfLiteError; @@ -181,7 +182,8 @@ TfLiteStatus MicroInterpreter::AllocateTensors() { TfLiteStatus MicroInterpreter::Invoke() { if (initialization_status_ != kTfLiteOk) { - error_reporter_->Report("Invoke() called after initialization failed\n"); + TF_LITE_REPORT_ERROR(error_reporter_, + "Invoke() called after initialization failed\n"); return kTfLiteError; } @@ -198,7 +200,8 @@ TfLiteStatus MicroInterpreter::Invoke() { if (registration->invoke) { TfLiteStatus invoke_status = registration->invoke(&context_, node); if (invoke_status != kTfLiteOk) { - error_reporter_->Report( + TF_LITE_REPORT_ERROR( + error_reporter_, "Node %s (number %d) failed to invoke with status %d", OpNameFromRegistration(registration), i, invoke_status); return kTfLiteError; @@ -212,8 +215,9 @@ TfLiteTensor* MicroInterpreter::input(size_t index) { const flatbuffers::Vector* inputs = subgraph_->inputs(); const size_t length = inputs->size(); if ((index < 0) || (index >= length)) { - error_reporter_->Report("Input index %d out of range (length is %d)", index, - length); + TF_LITE_REPORT_ERROR(error_reporter_, + "Input index %d out of range (length is %d)", index, + length); return nullptr; } return &(context_.tensors[inputs->Get(index)]); @@ -223,8 +227,9 @@ TfLiteTensor* MicroInterpreter::output(size_t index) { const flatbuffers::Vector* outputs = subgraph_->outputs(); const size_t length = outputs->size(); if ((index < 0) || (index >= outputs->size())) { - error_reporter_->Report("Output index %d out of range (length is %d)", - index, length); + TF_LITE_REPORT_ERROR(error_reporter_, + "Output index %d out of range (length is %d)", index, + length); return nullptr; } return &(context_.tensors[outputs->Get(index)]); @@ -233,8 +238,9 @@ TfLiteTensor* MicroInterpreter::output(size_t index) { TfLiteTensor* MicroInterpreter::tensor(size_t index) { const size_t length = tensors_size(); if ((index < 0) || (index >= tensors_size())) { - error_reporter_->Report("Tensor index %d out of range (length is %d)", - index, length); + TF_LITE_REPORT_ERROR(error_reporter_, + "Tensor index %d out of range (length is %d)", index, + length); return nullptr; } return &context_.tensors[index]; @@ -247,8 +253,8 @@ TfLiteStatus MicroInterpreter::ResetVariableTensors() { if (cur_tensor->is_variable) { TfLiteStatus status = tflite::ResetVariableTensor(cur_tensor); if (status != kTfLiteOk) { - error_reporter_->Report("Failed to reset variable tensor at index: %d", - i); + TF_LITE_REPORT_ERROR(error_reporter_, + "Failed to reset variable tensor at index: %d", i); return status; } } From 94dcf382b8593415b668e7d41ad4d203ec1a4305 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 13:30:13 -0800 Subject: [PATCH 266/442] Add pfor converter for StatelessIf. 
PiperOrigin-RevId: 296043610 Change-Id: Ic91fc5c5d1cf44928bfbb8b0a13c7f304564c214 --- .../ops/parallel_for/control_flow_ops_test.py | 55 +++++++- tensorflow/python/ops/parallel_for/pfor.py | 128 +++++++++++++++--- 2 files changed, 162 insertions(+), 21 deletions(-) diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py index 7d4d77a866e..65cbdbe4503 100644 --- a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py +++ b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py @@ -40,6 +40,7 @@ from tensorflow.python.framework import test_util from tensorflow.python.keras.layers import core as keras_core from tensorflow.python.ops import array_ops from tensorflow.python.ops import bitwise_ops +from tensorflow.python.ops import cond_v2 from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import data_flow_ops from tensorflow.python.ops import gen_nn_ops @@ -962,7 +963,7 @@ class StackTest(PForTestCase): # TODO(agarwal): test nested while_loops. This currently requires converting a # tf.cond. -class ControlFlowTest(PForTestCase): +class WhileV1Test(PForTestCase): def test_while_outside_loop(self): @@ -1211,6 +1212,58 @@ def create_dynamic_lstm(cell_fn, batch_size, state_size, max_steps): return pfor_output, tf_output +@test_util.run_all_in_graph_and_eager_modes +@test_util.with_control_flow_v2 +class StatelessIfTest(PForTestCase): + + def test_loop_variant_cond(self): + x = [1, 2, 3, 4, 5.] + y = 2.5 + + @def_function.function + def loop_fn(i): + x_i = array_ops.gather(x, i) + # Note that the output has a combination of then and else branches being + # loop variant / invariant. + return cond_v2.cond_v2( + x_i < y, + lambda: (y - x_i, y, 1., 2.), + lambda: (x_i - y, 0., y, 3.)) + + self._test_loop_fn(loop_fn, iters=5) + + def test_loop_invariant_cond(self): + x = [1, 2, 3, 4, 5.] + y = 0.5 + z = random_ops.random_uniform([]) + + @def_function.function + def loop_fn(i): + x_i = array_ops.gather(x, i) + # Note that the output has a combination of then and else branches being + # loop variant / invariant. + return cond_v2.cond_v2( + z < y, + lambda: (y - x_i, y, 1., 2.), + lambda: (x_i - y, 0., y, 3.)) + + self._test_loop_fn(loop_fn, iters=5) + + def test_empty_branch(self): + x = [1, 2, 3, 4, 5.] + y = 6. + + @def_function.function + def loop_fn(i): + x_i = array_ops.gather(x, i) + return cond_v2.cond_v2( + x_i < y, # Note that else branch is empty. 
+ lambda: (y - x_i, y, 1., 2.), + lambda: (x_i - y, 0., y, 3.)) + + self._test_loop_fn(loop_fn, iters=5) + + class RNNTest(PForTestCase): @test_util.run_v1_only("b/122612051") diff --git a/tensorflow/python/ops/parallel_for/pfor.py b/tensorflow/python/ops/parallel_for/pfor.py index b01f9a6aba4..88f31210ddb 100644 --- a/tensorflow/python/ops/parallel_for/pfor.py +++ b/tensorflow/python/ops/parallel_for/pfor.py @@ -74,6 +74,8 @@ flags.DEFINE_bool( def _stack(t, length): """stacks `t` `length` times.""" ones = array_ops.ones_like(array_ops.shape(t)) + ones = array_ops.reshape(ones, [-1]) + length = array_ops.reshape(length, [-1]) multiples = array_ops.concat([length, ones], 0) t = array_ops.tile(array_ops.expand_dims(t, 0), multiples) return wrap(t, True) @@ -3583,6 +3585,33 @@ def _convert_parse_example_v2(pfor_input): # functional_ops +def _convert_function_call(func, converter, inputs): + assert isinstance(func.graph, func_graph.FuncGraph), func + assert isinstance(converter, PFor) + + # TODO(agarwal): consider caching this function definition. + @def_function.function + def f(*args): + assert all(isinstance(arg, WrappedTensor) for arg in args), args + assert len(args) == len(func.graph.inputs), (args, func.graph.inputs) + # Map inputs to function arguments. + for inp, arg in zip(func.graph.inputs, args): + converter._add_conversion(inp, arg) + # Convert output tensors. + return tuple( + [converter._convert_helper(x).t for x in func._func_graph_outputs]) + + call_outputs = f(*inputs) + assert len(call_outputs) == len(func._func_graph_outputs) + outputs = [] + for call_output, output_tensor in zip(call_outputs, func._func_graph_outputs): + func_output = converter._convert_helper(output_tensor) + outputs.append( + wrap(call_output, func_output.is_stacked, + func_output.is_sparse_stacked)) + return outputs + + @RegisterPFor("StatefulPartitionedCall") @RegisterPFor("PartitionedCall") def _convert_partitioned_call(pfor_input): @@ -3598,28 +3627,87 @@ def _convert_partitioned_call(pfor_input): all_indices=pfor.all_indices, all_indices_partitioned=pfor.all_indices_partitioned, pfor_config=pfor.pfor_config) + return _convert_function_call(func, converter, pfor_input.inputs) - # TODO(agarwal): consider caching this function definition. - @def_function.function - def f(*args): - assert all(isinstance(arg, WrappedTensor) for arg in args), args - assert len(args) == len(func.graph.inputs), (args, func.graph.inputs) - # Map inputs to function arguments. - for inp, arg in zip(func.graph.inputs, args): - converter._add_conversion(inp, arg) - # Convert output tensors. 
- return tuple( - [converter._convert_helper(x).t for x in func._func_graph_outputs]) - call_outputs = f(*pfor_input.inputs) - assert len(call_outputs) == len(func._func_graph_outputs) - outputs = [] - for call_output, output_tensor in zip(call_outputs, func._func_graph_outputs): - func_output = converter._convert_helper(output_tensor) - outputs.append( - wrap(call_output, func_output.is_stacked, - func_output.is_sparse_stacked)) - return outputs +def _partition_inputs_for_indices(inputs, indices): + new_inputs = [] + for inp in inputs: + if inp.is_stacked: + new_inputs.append(wrap(array_ops.gather(inp.t, indices), True)) + else: + new_inputs.append(inp) + return new_inputs + + +def _outputs_for_branch(func_name, indices, pfor_input, inputs): + if indices is None: + indices = pfor_input.pfor.all_indices + partitioned = pfor_input.pfor.all_indices_partitioned + else: + partitioned = True + func = pfor_input.op.graph._get_function(func_name) + converter = PFor( + loop_var=pfor_input.pfor.loop_var, + loop_len=array_ops.size(indices), + pfor_ops=func.graph.get_operations(), + all_indices=indices, + all_indices_partitioned=partitioned, + pfor_config=pfor_input.pfor.pfor_config) + outputs = _convert_function_call(func, converter, inputs) + stacked_outputs = [] + for out in outputs: + if not out.is_stacked: + stacked_outputs.append(_stack(out.t, array_ops.size(indices)).t) + else: + stacked_outputs.append(out.t) + return stacked_outputs + + +@RegisterPFor("StatelessIf") +def _convert_stateless_if(pfor_input): + cond, cond_stacked, _ = pfor_input.input(0) + inputs = pfor_input.inputs[1:] + then_branch = pfor_input.get_attr("then_branch") + else_branch = pfor_input.get_attr("else_branch") + + if cond_stacked: + cond_int = math_ops.cast(cond, dtypes.int32) + # Compute loop indices for the different branches + false_indices, true_indices = data_flow_ops.dynamic_partition( + pfor_input.pfor.all_indices, cond_int, 2) + # Compute indices for cond being True or False. + if pfor_input.pfor.all_indices_partitioned: + else_indices, then_indices = data_flow_ops.dynamic_partition( + array_ops.range(len(pfor_input.pfor.all_indices)), cond_int, 2) + else: + else_indices, then_indices = false_indices, true_indices + # Partition inputs + then_inputs = _partition_inputs_for_indices(inputs, then_indices) + else_inputs = _partition_inputs_for_indices(inputs, else_indices) + + # Convert "then" branch. + then_outputs = _outputs_for_branch(then_branch.name, true_indices, + pfor_input, then_inputs) + + # Convert "else" branch. + else_outputs = _outputs_for_branch(else_branch.name, false_indices, + pfor_input, else_inputs) + + assert len(then_outputs) == len(else_outputs) + outputs = [] + # Merge outputs + for then_output, else_output in zip(then_outputs, else_outputs): + out = data_flow_ops.dynamic_stitch([then_indices, else_indices], + [then_output, else_output]) + outputs.append(wrap(out, True)) + return outputs + else: + outputs = control_flow_ops.cond( + cond, + lambda: _outputs_for_branch(then_branch.name, None, pfor_input, inputs), + lambda: _outputs_for_branch(else_branch.name, None, pfor_input, inputs)) + return [wrap(t, True) for t in outputs] # spectral_ops From 446566f97c01c12116bafde8de9631fe8e029ab9 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 19 Feb 2020 13:35:03 -0800 Subject: [PATCH 267/442] Clean up redundant bazel bindings no longer used by grpc PiperOrigin-RevId: 296044838 Change-Id: I34df2b59a5f02ac5da4ae1a5cd4de2054019f25f --- tensorflow/workspace.bzl | 44 ---------------------------------------- 1 file changed, 44 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 6d74a7fed92..c7160a9ffbd 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -1059,13 +1059,6 @@ def tf_bind(): # If that ends up being the case, please leave a comment explaining # why we can't depend on the canonical build target. - # gRPC wants a cares dependency but its contents is not actually - # important since we have set GRPC_ARES=0 in .bazelrc - native.bind( - name = "cares", - actual = "@com_github_nanopb_nanopb//:nanopb", - ) - # Needed by Protobuf native.bind( name = "grpc_cpp_plugin", @@ -1086,37 +1079,6 @@ def tf_bind(): actual = "@com_github_grpc_grpc//:grpc++_unsecure", ) - # Needed by gRPC - native.bind( - name = "libssl", - actual = "@boringssl//:ssl", - ) - - # Needed by gRPC - native.bind( - name = "nanopb", - actual = "@com_github_nanopb_nanopb//:nanopb", - ) - - # Needed by gRPC - native.bind( - name = "protobuf", - actual = "@com_google_protobuf//:protobuf", - ) - - # gRPC expects //external:protobuf_clib and //external:protobuf_compiler - # to point to Protobuf's compiler library. - native.bind( - name = "protobuf_clib", - actual = "@com_google_protobuf//:protoc_lib", - ) - - # Needed by gRPC - native.bind( - name = "protobuf_headers", - actual = "@com_google_protobuf//:protobuf_headers", - ) - # Needed by Protobuf native.bind( name = "python_headers", @@ -1128,9 +1090,3 @@ def tf_bind(): name = "six", actual = "@six_archive//:six", ) - - # Needed by gRPC - native.bind( - name = "zlib", - actual = "@zlib", - ) From 622e25e687e4ddc8dfba9494b374d79bf19df8d6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 13:35:48 -0800 Subject: [PATCH 268/442] Removes a spurious check from AddRematerializedInstruction(). It is possible to rematerialize an instruction even if it defines a buffer that is being used by the instruction that is currently in progress. Even though that particular buffer does not help to reduce the memory usage, it may still be beneficial to rematerialize the instruction due to other buffers that it defines. PiperOrigin-RevId: 296045046 Change-Id: I8236a04f420b341da9284b842484b815ccea4584 --- tensorflow/compiler/xla/service/hlo_rematerialization.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc index 5a34c502071..21be4216469 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc +++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc @@ -989,7 +989,6 @@ Status MemoryUsageTracker::AddRematerializedInstruction(Item* original_item, ItemList unplaced_users; for (Item* user : old_buffer.users) { if (user->placed) { - CHECK(IsFinished(user)) << user->instruction->name(); placed_users.push_back(user); } else { unplaced_users.push_back(user); From 1b2738a31c6362f8954386c05cbd0ead153c6dbc Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 19 Feb 2020 13:56:08 -0800 Subject: [PATCH 269/442] make op_level_cost_estimator more robust with "random" input part 2 PiperOrigin-RevId: 296050162 Change-Id: I398aa4cfcf8bf4d007095e02800c84e20d1fc2bb --- tensorflow/core/grappler/costs/op_level_cost_estimator.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc index aec9938afa5..fe5f12061f5 100644 --- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc +++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc @@ -1871,6 +1871,7 @@ Costs OpLevelCostEstimator::PredictMaxPoolGrad( // x: op_info.inputs(0) // y: op_info.inputs(1) // y_grad: op_info.inputs(2) + if (op_info.inputs_size() < 3) return Costs::ZeroCosts(/*inaccurate=*/true); ConvolutionDimensions dims = OpDimensionsFromInputs( op_info.inputs(0).shape(), op_info, &found_unknown_shapes); From 6edc8c2a9a34ac9b4f6fb78c61fdf0b795f457df Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 14:08:08 -0800 Subject: [PATCH 270/442] Export public symbols for programmatic profiling APIs. PiperOrigin-RevId: 296053405 Change-Id: I2d52a45e61bcb37d39d45be89c26a6a3f8f3ff1b --- tensorflow/python/BUILD | 1 + tensorflow/python/__init__.py | 1 + tensorflow/python/profiler/profiler_v2.py | 26 ++++++++++++++----- .../tools/api/generator/api_init_files.bzl | 3 +++ .../tools/api/golden/v2/tensorflow.pbtxt | 4 +++ ...rflow.profiler.experimental.-profile.pbtxt | 9 +++++++ .../v2/tensorflow.profiler.experimental.pbtxt | 19 ++++++++++++++ ...sorflow.profiler.experimental.server.pbtxt | 7 +++++ .../api/golden/v2/tensorflow.profiler.pbtxt | 7 +++++ 9 files changed, 71 insertions(+), 6 deletions(-) create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.profiler.experimental.-profile.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.profiler.experimental.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.profiler.experimental.server.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.profiler.pbtxt diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index dfed8ce0402..583d16e7b26 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -204,6 +204,7 @@ py_library( "//tensorflow/python/ops/ragged", "//tensorflow/python/ops/signal", "//tensorflow/python/profiler", + "//tensorflow/python/profiler:profiler_v2", "//tensorflow/python/saved_model", "//tensorflow/python/tools:module_util", "//tensorflow/python/tools/api/generator:create_python_api", diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py index 97eb7111fa5..7a9eac7931e 100644 --- a/tensorflow/python/__init__.py +++ b/tensorflow/python/__init__.py @@ -111,6 +111,7 @@ from tensorflow.python.ops.linalg.sparse import sparse from tensorflow.python.ops.losses import losses from tensorflow.python.ops.signal import signal from tensorflow.python.profiler import profiler +from tensorflow.python.profiler import profiler_v2 from tensorflow.python.saved_model import saved_model from tensorflow.python.summary import summary from tensorflow.python.tpu import api diff --git a/tensorflow/python/profiler/profiler_v2.py b/tensorflow/python/profiler/profiler_v2.py index 8401ed43031..afbe1ec5881 100644 --- a/tensorflow/python/profiler/profiler_v2.py +++ b/tensorflow/python/profiler/profiler_v2.py @@ -39,11 +39,13 @@ import threading from tensorflow.python.framework import errors from 
tensorflow.python.platform import tf_logging as logging from tensorflow.python.profiler.internal import _pywrap_profiler +from tensorflow.python.util.tf_export import tf_export _profiler = None _profiler_lock = threading.Lock() +@tf_export('profiler.experimental.start', v1=[]) def start(logdir): """Starts profiling. @@ -55,9 +57,9 @@ def start(logdir): Example usage: ```python - tf.profiler.start('logdir_path') + tf.profiler.experimental.start('logdir_path') # do your training here. - tf.profiler.stop() + tf.profiler.experimental.stop() ``` Launch TensorBoard and point it to the same logdir you provided to this API. @@ -81,10 +83,11 @@ def start(logdir): 'Another profiler is running.') +@tf_export('profiler.experimental.stop', v1=[]) def stop(save=True): """Stops the current profiling session. - The profiler session will be stopped and profile results will be saved. + The profiler session will be stopped and profile results can be saved. Args: save: An optional variable to save the results to TensorBoard. Default True. @@ -103,6 +106,7 @@ def stop(save=True): _profiler = None +@tf_export('profiler.experimental.server.start', v1=[]) def start_server(port): """Start a profiler grpc server that listens to given port. @@ -111,16 +115,26 @@ def start_server(port): Args: port: port profiler server listens to. + + Example usage: + ```python + tf.profiler.experimental.server.start('6009') + # do your training here. + """ _pywrap_profiler.start_server(port) -class Profiler(object): - """Context-manager profiler API. +@tf_export('profiler.experimental.Profile', v1=[]) +class Profile(object): + """Context-manager profile API. + + Profiling will start when entering the scope, and stop and save the results to + the logdir when exits the scope. Open TensorBoard profile tab to view results. 
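Taken together, the symbols exported in this change give two equivalent ways to capture a trace programmatically. The snippet below is a usage sketch only, assuming a TensorFlow build that includes these newly exported `tf.profiler.experimental` symbols; the workload and the log directory are placeholders.

```python
import tensorflow as tf

@tf.function
def train_step(x):
  # Placeholder workload; any model code can go here.
  return tf.reduce_sum(tf.square(x))

logdir = '/tmp/tb_log'  # placeholder path

# Explicit start/stop form.
tf.profiler.experimental.start(logdir)
for _ in range(10):
  train_step(tf.random.normal([256, 256]))
tf.profiler.experimental.stop()  # save=True by default

# Equivalent context-manager form exported here as
# tf.profiler.experimental.Profile.
with tf.profiler.experimental.Profile(logdir):
  train_step(tf.random.normal([256, 256]))
```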
Example usage: ```python - with Profiler("/path/to/logdir"): + with tf.profiler.experimental.Profile("/path/to/logdir"): # do some work ``` """ diff --git a/tensorflow/python/tools/api/generator/api_init_files.bzl b/tensorflow/python/tools/api/generator/api_init_files.bzl index cd7b258cb07..8542c745bb4 100644 --- a/tensorflow/python/tools/api/generator/api_init_files.bzl +++ b/tensorflow/python/tools/api/generator/api_init_files.bzl @@ -48,6 +48,9 @@ TENSORFLOW_API_INIT_FILES = [ "mlir/experimental/__init__.py", "nest/__init__.py", "nn/__init__.py", + "profiler/__init__.py", + "profiler/experimental/__init__.py", + "profiler/experimental/server/__init__.py", "quantization/__init__.py", "ragged/__init__.py", "random/__init__.py", diff --git a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt index 514addea995..c56730870eb 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt @@ -292,6 +292,10 @@ tf_module { name: "optimizers" mtype: "" } + member { + name: "profiler" + mtype: "" + } member { name: "qint16" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.experimental.-profile.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.experimental.-profile.pbtxt new file mode 100644 index 00000000000..c777d3705d9 --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.profiler.experimental.-profile.pbtxt @@ -0,0 +1,9 @@ +path: "tensorflow.profiler.experimental.Profile" +tf_class { + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'logdir\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.experimental.pbtxt new file mode 100644 index 00000000000..9c503abf268 --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.profiler.experimental.pbtxt @@ -0,0 +1,19 @@ +path: "tensorflow.profiler.experimental" +tf_module { + member { + name: "Profile" + mtype: "" + } + member { + name: "server" + mtype: "" + } + member_method { + name: "start" + argspec: "args=[\'logdir\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "stop" + argspec: "args=[\'save\'], varargs=None, keywords=None, defaults=[\'True\'], " + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.experimental.server.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.experimental.server.pbtxt new file mode 100644 index 00000000000..9f677df3771 --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.profiler.experimental.server.pbtxt @@ -0,0 +1,7 @@ +path: "tensorflow.profiler.experimental.server" +tf_module { + member_method { + name: "start" + argspec: "args=[\'port\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.pbtxt new file mode 100644 index 00000000000..31a9adb2384 --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.profiler.pbtxt @@ -0,0 +1,7 @@ +path: "tensorflow.profiler" +tf_module { + member { + name: "experimental" + mtype: "" + } +} From 2e667319a5e18c0b1caafb2f7c4f8387a1ab747e Mon Sep 17 00:00:00 2001 From: Reed Wanderman-Milne Date: Wed, 19 Feb 2020 14:09:46 -0800 Subject: [PATCH 271/442] Run mixed precision tests in more cases. 
PiperOrigin-RevId: 296053767 Change-Id: I4bcc64b9f09046b23cab0fd76e017f581242bfee --- .../experimental/keras_test.py | 23 ++++++------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/tensorflow/python/keras/mixed_precision/experimental/keras_test.py b/tensorflow/python/keras/mixed_precision/experimental/keras_test.py index f1bf1f2bde2..8ec8d914cf5 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/keras_test.py +++ b/tensorflow/python/keras/mixed_precision/experimental/keras_test.py @@ -421,13 +421,11 @@ class KerasLayerTest(keras_parameterized.TestCase): class KerasModelTest(keras_parameterized.TestCase): """Test mixed precision with Keras models.""" - def _skip_if_strategy_unsupported(self, strategy_fn, check_model_type=False): + def _skip_if_strategy_unsupported(self, strategy_fn): if (strategy_fn != default_strategy_fn and - (testing_utils.should_run_eagerly() or - (check_model_type and testing_utils.get_model_type() == 'subclass'))): + testing_utils.get_model_type() == 'subclass'): self.skipTest('Non-default strategies are unsupported with subclassed ' - 'models or with passing run_eagerly=True to ' - 'Model.compile()') + 'models') def _skip_if_save_format_unsupported(self, save_format): model_type = testing_utils.get_model_type() @@ -435,8 +433,8 @@ class KerasModelTest(keras_parameterized.TestCase): self.skipTest('Saving subclassed models with the HDF5 format is ' 'unsupported') if (save_format == 'tf' and model_type == 'subclass' and - not testing_utils.should_run_tf_function()): - self.skipTest('b/142352416: This combination of features is currently ' + not context.executing_eagerly()): + self.skipTest('b/148820505: This combination of features is currently ' 'broken.') @keras_parameterized.run_with_all_model_types @@ -494,11 +492,10 @@ class KerasModelTest(keras_parameterized.TestCase): 'save_format': 'h5', 'use_regularizer': True, }, { - # TODO(b/148874820): Test saving a model with CentralStorageStrategy. - # Currently this doesn't work even for float32. 'testcase_name': 'central_storage', 'strategy_fn': create_central_storage_strategy, 'use_regularizer': True, + 'save_format': 'tf' }, { 'testcase_name': 'norun_distributed', 'strategy_fn': create_mirrored_strategy, @@ -513,7 +510,7 @@ class KerasModelTest(keras_parameterized.TestCase): save_format=None, use_input_spec=False, experimental_run_tf_function=True): - self._skip_if_strategy_unsupported(strategy_fn, check_model_type=True) + self._skip_if_strategy_unsupported(strategy_fn) self._skip_if_save_format_unsupported(save_format) regularizer = (mp_test_util.IdentityRegularizer() if use_regularizer else None) @@ -620,7 +617,6 @@ class KerasModelTest(keras_parameterized.TestCase): strategy_fn, experimental_run_tf_function=True): # Note: We do not test mixed precision in this method, only loss scaling. - self._skip_if_strategy_unsupported(strategy_fn) loss_scale = 8. batch_size = 4 with strategy_fn().scope(): @@ -679,7 +675,6 @@ class KerasModelTest(keras_parameterized.TestCase): # * Regularization on some variables and not others. # * A fixed loss scale (if use_loss_scaling is True) - self._skip_if_strategy_unsupported(strategy_fn) strategy = strategy_fn() if use_loss_scaling: loss_scale = 8. @@ -779,7 +774,6 @@ class KerasModelTest(keras_parameterized.TestCase): pass_loss_scale_to_policy=False, get_config=False, experimental_run_tf_function=True): - self._skip_if_strategy_unsupported(strategy_fn) strategy = strategy_fn() initial_loss_scale = 2. 
batch_size = 4 @@ -956,7 +950,6 @@ class KerasModelTest(keras_parameterized.TestCase): def test_save_slot_variables_with_autocast_vars(self, strategy_fn, var_name='v'): - self._skip_if_strategy_unsupported(strategy_fn) p = policy.Policy('mixed_float16', loss_scale=None) with strategy_fn().scope(), policy.policy_scope(p): x = layers.Input(shape=(2,), batch_size=2) @@ -992,7 +985,6 @@ class KerasModelTest(keras_parameterized.TestCase): @keras_parameterized.run_all_keras_modes @parameterized.named_parameters(*TESTCASES) def test_save_weights_with_dynamic_loss_scaling(self, strategy_fn): - self._skip_if_strategy_unsupported(strategy_fn) strategy = strategy_fn() if (isinstance(strategy, mirrored_strategy.MirroredStrategy) and not context.executing_eagerly()): @@ -1051,7 +1043,6 @@ class KerasModelTest(keras_parameterized.TestCase): 'h5': True, }) def test_save_model_with_dynamic_loss_scaling(self, strategy_fn, h5=False): - self._skip_if_strategy_unsupported(strategy_fn) # TODO(reedwm): Support and test saving model with a mixed_[b]float16 policy # as well. strategy = strategy_fn() From b18833b60c2d684c67b6b0c1f51d6f23bc13d434 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Wed, 19 Feb 2020 14:29:10 -0800 Subject: [PATCH 272/442] Make use of GetDataDependencyFilepath and JoinPath to build paths which will work across operating systems. The previous implementation doesn't work correctly on Windows. PiperOrigin-RevId: 296058125 Change-Id: I516774d9f45fb1b5f73d684416f98d91ab283266 --- tensorflow/core/platform/cloud/BUILD | 1 + .../platform/cloud/google_auth_provider_test.cc | 16 ++++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/tensorflow/core/platform/cloud/BUILD b/tensorflow/core/platform/cloud/BUILD index 7b194e78911..21e826242f9 100644 --- a/tensorflow/core/platform/cloud/BUILD +++ b/tensorflow/core/platform/cloud/BUILD @@ -450,6 +450,7 @@ tf_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core/platform:path", + "//tensorflow/core/platform:resource_loader", ], ) diff --git a/tensorflow/core/platform/cloud/google_auth_provider_test.cc b/tensorflow/core/platform/cloud/google_auth_provider_test.cc index 5bee2072034..4f13750dcfd 100644 --- a/tensorflow/core/platform/cloud/google_auth_provider_test.cc +++ b/tensorflow/core/platform/cloud/google_auth_provider_test.cc @@ -20,13 +20,16 @@ limitations under the License. #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/cloud/http_request_fake.h" #include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/resource_loader.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { namespace { -constexpr char kTestData[] = "core/platform/cloud/testdata/"; +string TestData() { + return io::JoinPath("tensorflow", "core", "platform", "cloud", "testdata"); +} class FakeEnv : public EnvWrapper { public: @@ -80,13 +83,11 @@ class GoogleAuthProviderTest : public ::testing::Test { TEST_F(GoogleAuthProviderTest, EnvironmentVariable_Caching) { setenv("GOOGLE_APPLICATION_CREDENTIALS", - io::JoinPath( - io::JoinPath(testing::TensorFlowSrcRoot(), kTestData).c_str(), - "service_account_credentials.json") + GetDataDependencyFilepath( + io::JoinPath(TestData(), "service_account_credentials.json")) .c_str(), 1); - setenv("CLOUDSDK_CONFIG", - io::JoinPath(testing::TensorFlowSrcRoot(), kTestData).c_str(), + setenv("CLOUDSDK_CONFIG", GetDataDependencyFilepath(TestData()).c_str(), 1); // Will not be used. 
auto oauth_client = new FakeOAuthClient; @@ -123,8 +124,7 @@ TEST_F(GoogleAuthProviderTest, EnvironmentVariable_Caching) { } TEST_F(GoogleAuthProviderTest, GCloudRefreshToken) { - setenv("CLOUDSDK_CONFIG", - io::JoinPath(testing::TensorFlowSrcRoot(), kTestData).c_str(), 1); + setenv("CLOUDSDK_CONFIG", GetDataDependencyFilepath(TestData()).c_str(), 1); auto oauth_client = new FakeOAuthClient; std::vector requests; From ccfb01ed6da10fad0bf0a449f74625470fbcf8b0 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 14:42:21 -0800 Subject: [PATCH 273/442] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 296061050 Change-Id: Ia8dfa50365e171f2deb0f22318cfbe75a1a1c9b1 --- tensorflow/go/op/wrappers.go | 61 +++++++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 8 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index f69affe5e8a..449a95765a5 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -44948,17 +44948,62 @@ func InfeedEnqueue(scope *Scope, input tf.Output, optional ...InfeedEnqueueAttr) return scope.AddOperation(opspec) } -// A dataset that creates window datasets from the input dataset. +// Combines (nests of) input elements into a dataset of (nests of) windows. +// +// A "window" is a finite dataset of flat elements of size `size` (or possibly +// fewer if there are not enough input elements to fill the window and +// `drop_remainder` evaluates to false). +// +// The `shift` argument determines the number of input elements by which +// the window moves on each iteration. The first element in the `k`th window +// will be element +// +// ``` +// 1 + (k-1) * shift +// ``` +// +// of the input dataset. In particular, the first element of the first window +// will always be the first element of the input dataset. +// +// If the `stride` parameter is greater than 1, then each window will skip +// `(stride - 1)` input elements between each element that appears in the +// window. Output windows will still contain `size` elements regardless of +// the value of `stride`. +// +// The `stride` argument determines the stride of the input elements, and the +// `shift` argument determines the shift of the window. +// +// For example, letting `{...}` to represent a Dataset: +// +// - `tf.data.Dataset.range(7).window(2)` produces +// `{{0, 1}, {2, 3}, {4, 5}, {6}}` +// - `tf.data.Dataset.range(7).window(3, 2, 1, True)` produces +// `{{0, 1, 2}, {2, 3, 4}, {4, 5, 6}}` +// - `tf.data.Dataset.range(7).window(3, 1, 2, True)` produces +// `{{0, 2, 4}, {1, 3, 5}, {2, 4, 6}}` +// +// Note that when the `window` transformation is applied to a dataset of +// nested elements, it produces a dataset of nested windows. +// +// For example: +// +// - `tf.data.Dataset.from_tensor_slices((range(4), range(4))).window(2)` +// produces `{({0, 1}, {0, 1}), ({2, 3}, {2, 3})}` +// - `tf.data.Dataset.from_tensor_slices({"a": range(4)}).window(2)` +// produces `{{"a": {0, 1}}, {"a": {2, 3}}}` // // Arguments: // -// size: A scalar representing the number of elements to accumulate in a window. -// shift: A scalar representing the steps moving the sliding window forward in one -// iteration. It must be positive. -// stride: A scalar representing the stride of the input elements of the sliding window. -// It must be positive. -// drop_remainder: A scalar representing whether a window should be dropped in case its size is -// smaller than desired. 
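The rewritten Go-side documentation that follows spells out how `size`, `shift`, and `stride` interact. For readers who want to check the quoted results themselves, here is a small runnable sketch using the Python `tf.data.Dataset.window` transformation (which corresponds to this op); it assumes a TF 2.x eager environment, and the printed windows match the examples given in the documentation text below.

```python
import tensorflow as tf

ds = tf.data.Dataset.range(7)

# size=3, shift=2, stride=1: each window advances by two input elements.
for window in ds.window(3, shift=2, stride=1, drop_remainder=True):
  print(list(window.as_numpy_iterator()))
# [0, 1, 2]
# [2, 3, 4]
# [4, 5, 6]

# size=3, shift=1, stride=2: each window skips every other input element.
for window in ds.window(3, shift=1, stride=2, drop_remainder=True):
  print(list(window.as_numpy_iterator()))
# [0, 2, 4]
# [1, 3, 5]
# [2, 4, 6]
```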
+// size: An integer scalar, representing the number of elements +// of the input dataset to combine into a window. Must be positive. +// shift: An integer scalar, representing the number of input elements +// by which the window moves in each iteration. Defaults to `size`. +// Must be positive. +// stride: An integer scalar, representing the stride of the input elements +// in the sliding window. Must be positive. The default value of 1 means +// "retain every input element". +// drop_remainder: A Boolean scalar, representing whether the last window should be +// dropped if its size is smaller than `window_size`. // // func WindowDataset(scope *Scope, input_dataset tf.Output, size tf.Output, shift tf.Output, stride tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { From 3cdc5de1060c9362d296985f5f958c5f810b83dd Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 14:46:03 -0800 Subject: [PATCH 274/442] Adds additional error logging. Otherwise this ends up producing opaque messages like "Failed to evaluate the model.", when the problem is just label file doesn't exist. PiperOrigin-RevId: 296061984 Change-Id: I394be0a1f6219879613ad11b794eefa9fb3d8dcd --- .../lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc index 0e0c7786cbf..6fbd18d6c2b 100644 --- a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc +++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc @@ -210,8 +210,12 @@ TfLiteStatus ImagenetModelEvaluator::EvaluateModel() const { tflite::evaluation::GetSortedFileNames(data_path, &image_files)); std::vector ground_truth_image_labels; if (!tflite::evaluation::ReadFileLines(params_.ground_truth_labels_path, - &ground_truth_image_labels)) + &ground_truth_image_labels)) { + LOG(ERROR) << "Unable to read ground truth labels from: " + << params_.ground_truth_labels_path + << " Perhaps file doesn't exist or is unreadable."; return kTfLiteError; + } if (image_files.size() != ground_truth_image_labels.size()) { LOG(ERROR) << "Images and ground truth labels don't match"; return kTfLiteError; From 07827aafe797e6e47b5a34f19c2431781f8c3136 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 19 Feb 2020 14:47:35 -0800 Subject: [PATCH 275/442] Change required numpy version to >= 1.16 PiperOrigin-RevId: 296062347 Change-Id: I416e4429606f130da6de08719da5693b82f0dcf7 --- tensorflow/lite/tools/pip_package/setup.py | 2 +- tensorflow/tools/ci_build/release/common.sh | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/tools/pip_package/setup.py b/tensorflow/lite/tools/pip_package/setup.py index 809f7149a6f..19c9993e5fa 100644 --- a/tensorflow/lite/tools/pip_package/setup.py +++ b/tensorflow/lite/tools/pip_package/setup.py @@ -201,7 +201,7 @@ setup( packages=find_packages(exclude=[]), ext_modules=[ext], install_requires=[ - 'numpy >= 1.12.1', + 'numpy >= 1.16.0', ], cmdclass={ 'build_ext': CustomBuildExt, diff --git a/tensorflow/tools/ci_build/release/common.sh b/tensorflow/tools/ci_build/release/common.sh index e328a2f94a6..2f111694dd2 100644 --- a/tensorflow/tools/ci_build/release/common.sh +++ b/tensorflow/tools/ci_build/release/common.sh @@ -152,6 +152,7 @@ function install_pip_deps { # TODO(aselle): Change all these to be --user instead of sudo. ${SUDO_CMD} ${PIP_CMD} install astunparse==1.6.3 ${SUDO_CMD} ${PIP_CMD} install keras_preprocessing==1.1.0 --no-deps + "${PIP_CMD}" install numpy==1.16.0 --user ${SUDO_CMD} ${PIP_CMD} install gast==0.3.3 ${SUDO_CMD} ${PIP_CMD} install h5py==2.10.0 ${SUDO_CMD} ${PIP_CMD} install six==1.12.0 @@ -183,7 +184,7 @@ function install_ubuntu_16_pip_deps { "${PIP_CMD}" install astunparse==1.6.3 --user "${PIP_CMD}" install --user --upgrade attrs "${PIP_CMD}" install keras_preprocessing==1.1.0 --no-deps --user - "${PIP_CMD}" install numpy==1.14.5 --user + "${PIP_CMD}" install numpy==1.16.0 --user "${PIP_CMD}" install --user --upgrade "future>=0.17.1" "${PIP_CMD}" install gast==0.3.3 --user "${PIP_CMD}" install h5py==2.10.0 --user @@ -228,7 +229,7 @@ function install_macos_pip_deps { ${SUDO_CMD} ${PIP_CMD} install --upgrade mock portpicker scipy grpcio ${SUDO_CMD} ${PIP_CMD} install six==1.12.0 ${SUDO_CMD} ${PIP_CMD} install scikit-learn - ${SUDO_CMD} ${PIP_CMD} install numpy==1.14.5 + ${SUDO_CMD} ${PIP_CMD} install numpy==1.16.0 ${SUDO_CMD} ${PIP_CMD} install gast==0.3.3 ${SUDO_CMD} ${PIP_CMD} install h5py==2.10.0 ${SUDO_CMD} ${PIP_CMD} install --upgrade grpcio From 439595440b378c2b87c4a0159e86e5ba694687c9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 14:57:57 -0800 Subject: [PATCH 276/442] Export public symbols for on demand profiling APIs. 
PiperOrigin-RevId: 296064710 Change-Id: I3a3b549fb59fe9ecbfc6c07ba809b0c1732932e4 --- tensorflow/python/BUILD | 1 + tensorflow/python/__init__.py | 1 + tensorflow/python/profiler/BUILD | 1 + tensorflow/python/profiler/profiler_client.py | 76 +++++++++++++++++-- .../tools/api/generator/api_init_files.bzl | 1 + ...sorflow.profiler.experimental.client.pbtxt | 11 +++ .../v2/tensorflow.profiler.experimental.pbtxt | 4 + 7 files changed, 88 insertions(+), 7 deletions(-) create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.profiler.experimental.client.pbtxt diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 583d16e7b26..15d21d34bc5 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -204,6 +204,7 @@ py_library( "//tensorflow/python/ops/ragged", "//tensorflow/python/ops/signal", "//tensorflow/python/profiler", + "//tensorflow/python/profiler:profiler_client", "//tensorflow/python/profiler:profiler_v2", "//tensorflow/python/saved_model", "//tensorflow/python/tools:module_util", diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py index 7a9eac7931e..6d88cb566ae 100644 --- a/tensorflow/python/__init__.py +++ b/tensorflow/python/__init__.py @@ -111,6 +111,7 @@ from tensorflow.python.ops.linalg.sparse import sparse from tensorflow.python.ops.losses import losses from tensorflow.python.ops.signal import signal from tensorflow.python.profiler import profiler +from tensorflow.python.profiler import profiler_client from tensorflow.python.profiler import profiler_v2 from tensorflow.python.saved_model import saved_model from tensorflow.python.summary import summary diff --git a/tensorflow/python/profiler/BUILD b/tensorflow/python/profiler/BUILD index 6c2abbd1f4b..2566b8b48c6 100644 --- a/tensorflow/python/profiler/BUILD +++ b/tensorflow/python/profiler/BUILD @@ -26,6 +26,7 @@ py_library( srcs_version = "PY2AND3", deps = [ "//tensorflow/python:c_api_util", + "//tensorflow/python:util", "//tensorflow/python/profiler/internal:_pywrap_profiler", ], ) diff --git a/tensorflow/python/profiler/profiler_client.py b/tensorflow/python/profiler/profiler_client.py index d67c275aebf..d8856c48c53 100644 --- a/tensorflow/python/profiler/profiler_client.py +++ b/tensorflow/python/profiler/profiler_client.py @@ -20,7 +20,12 @@ from __future__ import print_function from tensorflow.python.profiler.internal import _pywrap_profiler +from tensorflow.python.util.tf_export import tf_export +_GRPC_PREFIX = 'grpc://' + + +@tf_export('profiler.experimental.client.trace', v1=[]) def trace(service_addr, logdir, duration_ms, @@ -28,10 +33,15 @@ def trace(service_addr, num_tracing_attempts=3): """Sends grpc requests to profiler server to perform on-demand profiling. - This method will block caller thread until receives tracing result. + This method will block caller thread until it receives tracing result. This + method supports CPU, GPU, and Cloud TPU. This method supports profiling a + single host for CPU, GPU, TPU, as well as multiple TPU workers. + The profiled results will be saved to your specified TensorBoard log + directory (e.g. the directory you save your model checkpoints). Use the + TensorBoard profile plugin to view the visualization and analysis results. Args: - service_addr: Address of profiler service e.g. localhost:6009. + service_addr: gRPC address of profiler service e.g. grpc://localhost:6009. logdir: Path of TensorBoard log directory e.g. /tmp/tb_log. duration_ms: Duration of tracing or monitoring in ms. worker_list: Optional. 
The list of workers that we are about to profile in @@ -41,23 +51,75 @@ def trace(service_addr, Raises: UnavailableError: If no trace event is collected. + + Example usage (CPU/GPU): + # Start a profiler server before your model runs. + ```python + tf.profiler.experimental.server.start(6009) + # your model code. + # Send gRPC request to the profiler server to collect a trace of your model. + ```python + tf.profiler.experimental.client.trace('grpc://localhost:6009', + '/tmp/tb_log', 2000) + + Example usage (TPU): + # Send gRPC request to a TPU worker to collect a trace of your model. A + # profiler service has been started in the TPU worker at port 8466. + ```python + # E.g. your TPU IP address is 10.0.0.2 and you want to profile for 2 seconds. + tf.profiler.experimental.client.trace('grpc://10.0.0.2:8466', + 'gs://your_tb_dir', 2000) + + Example usage (Multiple TPUs): + # Send gRPC request to a TPU pod to collect a trace of your model on multiple + # TPUs. A profiler service has been started in all the TPU workers at the + # port 8466. + ```python + # E.g. your TPU IP addresses are 10.0.0.2, 10.0.0.3, 10.0.0.4, and you want to + # profile for 2 seconds. + tf.profiler.experimental.client.trace('grpc://10.0.0.2:8466', + 'gs://your_tb_dir', + 2000, '10.0.0.3,10.0.0.4') + + Launch TensorBoard and point it to the same logdir you provided to this API. + $ tensorboard --logdir=/tmp/tb_log (or gs://your_tb_dir in the above examples) + Open your browser and go to localhost:6006/#profile to view profiling results. + """ - _pywrap_profiler.trace(service_addr, logdir, worker_list, True, duration_ms, - num_tracing_attempts) + _pywrap_profiler.trace( + _strip_prefix(service_addr, _GRPC_PREFIX), logdir, worker_list, True, + duration_ms, num_tracing_attempts) +@tf_export('profiler.experimental.client.monitor', v1=[]) def monitor(service_addr, duration_ms, level=1): """Sends grpc requests to profiler server to perform on-demand monitoring. - This method will block caller thread until receives monitoring result. + The monitoring result is a light weight performance summary of your model + execution. This method will block the caller thread until it receives the + monitoring result. This method currently supports Cloud TPU only. Args: - service_addr: Address of profiler service e.g. localhost:6009. + service_addr: gRPC address of profiler service e.g. grpc://10.0.0.2:8466. duration_ms: Duration of monitoring in ms. level: Choose a monitoring level between 1 and 2 to monitor your job. Level 2 is more verbose than level 1 and shows more metrics. Returns: A string of monitoring output. + + Example usage: + # Continuously send gRPC requests to the Cloud TPU to monitor the model + # execution. 
+ ```python + for query in range(0, 100): + print(tf.profiler.experimental.client.monitor('grpc://10.0.0.2:8466', 1000)) + + """ - return _pywrap_profiler.monitor(service_addr, duration_ms, level, True) + return _pywrap_profiler.monitor( + _strip_prefix(service_addr, _GRPC_PREFIX), duration_ms, level, True) + + +def _strip_prefix(s, prefix): + return s[len(prefix):] if s.startswith(prefix) else s diff --git a/tensorflow/python/tools/api/generator/api_init_files.bzl b/tensorflow/python/tools/api/generator/api_init_files.bzl index 8542c745bb4..3aab59e50aa 100644 --- a/tensorflow/python/tools/api/generator/api_init_files.bzl +++ b/tensorflow/python/tools/api/generator/api_init_files.bzl @@ -50,6 +50,7 @@ TENSORFLOW_API_INIT_FILES = [ "nn/__init__.py", "profiler/__init__.py", "profiler/experimental/__init__.py", + "profiler/experimental/client/__init__.py", "profiler/experimental/server/__init__.py", "quantization/__init__.py", "ragged/__init__.py", diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.experimental.client.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.experimental.client.pbtxt new file mode 100644 index 00000000000..4b44f126be8 --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.profiler.experimental.client.pbtxt @@ -0,0 +1,11 @@ +path: "tensorflow.profiler.experimental.client" +tf_module { + member_method { + name: "monitor" + argspec: "args=[\'service_addr\', \'duration_ms\', \'level\'], varargs=None, keywords=None, defaults=[\'1\'], " + } + member_method { + name: "trace" + argspec: "args=[\'service_addr\', \'logdir\', \'duration_ms\', \'worker_list\', \'num_tracing_attempts\'], varargs=None, keywords=None, defaults=[\'\', \'3\'], " + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.experimental.pbtxt index 9c503abf268..2823f422b85 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.experimental.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.profiler.experimental.pbtxt @@ -4,6 +4,10 @@ tf_module { name: "Profile" mtype: "" } + member { + name: "client" + mtype: "" + } member { name: "server" mtype: "" From 09fe958feebec0405ccac225c94fc130304fc2f4 Mon Sep 17 00:00:00 2001 From: Jakob Buchgraber Date: Wed, 19 Feb 2020 14:59:49 -0800 Subject: [PATCH 277/442] Enable Remote Config for ROCM and CUDA RBE pre- and postsubmits Previously TF_CUDA_CONFIG_REPO would point to a pregenerated and checked in configuration. This changes has it point to a remote repository intead that generates the configuration during the build for the specific docker image. All supported configurations can be found in third_party/toolchains/remote_config/configs.bzl. Each tensorflow_rbe_config() macro creates a few remote repositories to which to point the TF_*_CONFIG_REPO environment variables to. The remote repository names are prefixed with the macro's name. For example, tensorflow_rbe_config(name = "ubuntu") will create @ubuntu_config_python, @ubuntu_config_cuda, @ubuntu_config_nccl, etc. This change also introduces the platform_configure. All this rule does is create a remote repository with a single platform target for the tensorflow_rbe_config(). This will make the platforms defined in //third_party/toolchains/BUILD obsolete once remote config is fully rolled out. 
PiperOrigin-RevId: 296065144 Change-Id: Ia54beeb771b28846444e27a2023f70abbd9f6ad5 --- .bazelrc | 4 + tensorflow/opensource_only.files | 6 + .../ubuntu_16/gpu_py36_full/build.sh | 24 +++- tensorflow/workspace.bzl | 4 + third_party/gpus/cuda_configure.bzl | 69 +++++++--- third_party/gpus/rocm_configure.bzl | 48 +++++-- third_party/nccl/nccl_configure.bzl | 29 ++-- third_party/py/python_configure.bzl | 22 ++- third_party/remote_config/BUILD.tpl | 11 ++ .../remote_platform_configure.bzl | 17 +++ third_party/tensorrt/tensorrt_configure.bzl | 24 +++- third_party/toolchains/remote_config/BUILD | 0 .../toolchains/remote_config/configs.bzl | 24 ++++ .../toolchains/remote_config/containers.bzl | 20 +++ .../toolchains/remote_config/rbe_config.bzl | 125 ++++++++++++++++++ 15 files changed, 365 insertions(+), 62 deletions(-) create mode 100644 third_party/remote_config/BUILD.tpl create mode 100644 third_party/remote_config/remote_platform_configure.bzl create mode 100644 third_party/toolchains/remote_config/BUILD create mode 100644 third_party/toolchains/remote_config/configs.bzl create mode 100644 third_party/toolchains/remote_config/containers.bzl create mode 100644 third_party/toolchains/remote_config/rbe_config.bzl diff --git a/.bazelrc b/.bazelrc index 5f9173b9d36..2b80063fd59 100644 --- a/.bazelrc +++ b/.bazelrc @@ -319,6 +319,10 @@ build:xla --define=with_xla_support=true # BEGIN TF REMOTE BUILD EXECUTION OPTIONS # Options when using remote execution # WARNING: THESE OPTIONS WONT WORK IF YOU DO NOT HAVE PROPER AUTHENTICATION AND PERMISSIONS + +# Flag to enable remote config +common --experimental_repo_remote_exec + build:rbe --action_env=BAZEL_DO_NOT_DETECT_CPP_TOOLCHAIN=1 build:rbe --google_default_credentials build:rbe --bes_backend=buildeventservice.googleapis.com diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files index 4d39efad106..026f2675737 100644 --- a/tensorflow/opensource_only.files +++ b/tensorflow/opensource_only.files @@ -149,7 +149,9 @@ tensorflow/third_party/py/python_configure.bzl tensorflow/third_party/pybind11.BUILD tensorflow/third_party/python_runtime/BUILD tensorflow/third_party/remote_config/BUILD +tensorflow/third_party/remote_config/BUILD.tpl tensorflow/third_party/remote_config/common.bzl +tensorflow/third_party/remote_config/remote_platform_configure.bzl tensorflow/third_party/repo.bzl tensorflow/third_party/six.BUILD tensorflow/third_party/snappy.BUILD @@ -280,6 +282,10 @@ tensorflow/third_party/toolchains/remote/BUILD tensorflow/third_party/toolchains/remote/BUILD.tpl tensorflow/third_party/toolchains/remote/configure.bzl tensorflow/third_party/toolchains/remote/execution.bzl.tpl +tensorflow/third_party/toolchains/remote_config/BUILD +tensorflow/third_party/toolchains/remote_config/configs.bzl +tensorflow/third_party/toolchains/remote_config/containers.bzl +tensorflow/third_party/toolchains/remote_config/rbe_config.bzl tensorflow/third_party/wrapt.BUILD tensorflow/third_party/zlib.BUILD tensorflow/tools/ci_build/release/common.sh diff --git a/tensorflow/tools/ci_build/presubmit/ubuntu_16/gpu_py36_full/build.sh b/tensorflow/tools/ci_build/presubmit/ubuntu_16/gpu_py36_full/build.sh index 935db96add1..1498063630a 100644 --- a/tensorflow/tools/ci_build/presubmit/ubuntu_16/gpu_py36_full/build.sh +++ b/tensorflow/tools/ci_build/presubmit/ubuntu_16/gpu_py36_full/build.sh @@ -50,6 +50,13 @@ function run_build () { # Get the default test targets for bazel. 
source tensorflow/tools/ci_build/build_scripts/PRESUBMIT_BUILD_TARGETS.sh + RBE_CONFIG="@ubuntu16.04-py3-gcc7_manylinux2010-cuda10.0-cudnn7-tensorrt5.1" + TF_CUDA_CONFIG_REPO="${RBE_CONFIG}_config_cuda" + TF_TENSORRT_CONFIG_REPO="${RBE_CONFIG}_config_tensorrt" + TF_PYTHON_CONFIG_REPO="${RBE_CONFIG}_config_python" + TF_NCCL_CONFIG_REPO="${RBE_CONFIG}_config_nccl" + TF_RBE_PLATFORM="${RBE_CONFIG}_config_platform//:platform" + # Run bazel test command. Double test timeouts to avoid flakes. # //tensorflow/core/platform:setround_test is not supported. See b/64264700 # TODO(klimek): Re-enable tensorrt tests (with different runtime image) once @@ -65,12 +72,14 @@ function run_build () { --action_env=TF2_BEHAVIOR="${TF2_BEHAVIOR}" \ --action_env=REMOTE_GPU_TESTING=1 \ --action_env=TF_CUDA_COMPUTE_CAPABILITIES="${TF_CUDA_COMPUTE_CAPABILITIES}" \ - --action_env=TF_CUDA_CONFIG_REPO=@org_tensorflow//third_party/toolchains/preconfig/ubuntu16.04/cuda10.0-cudnn7 \ + --action_env=TF_CUDA_CONFIG_REPO="${TF_CUDA_CONFIG_REPO}" \ --action_env=TF_CUDA_VERSION=10 \ --action_env=TF_CUDNN_VERSION=7 \ --action_env=TF_NEED_TENSORRT=0 \ + --action_env=TF_TENSORRT_CONFIG_REPO="${TF_TENSORRT_CONFIG_REPO}" \ --action_env=TF_NEED_CUDA=1 \ - --action_env=TF_PYTHON_CONFIG_REPO=@org_tensorflow//third_party/toolchains/preconfig/ubuntu16.04/py3 \ + --action_env=TF_PYTHON_CONFIG_REPO="${TF_PYTHON_CONFIG_REPO}" \ + --action_env=TF_NCCL_CONFIG_REPO="${TF_NCCL_CONFIG_REPO}" \ --test_env=LD_LIBRARY_PATH \ --test_tag_filters="${tag_filters}" \ --build_tag_filters="${tag_filters}" \ @@ -89,17 +98,17 @@ function run_build () { --linkopt=-lm \ --distinct_host_configuration=false \ --remote_default_exec_properties=build=${CACHE_SILO_VAL} \ - --crosstool_top=//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.0:toolchain \ + --crosstool_top="${TF_CUDA_CONFIG_REPO}//crosstool:toolchain" \ --host_javabase=@bazel_toolchains//configs/ubuntu16_04_clang/1.1:jdk8 \ --javabase=@bazel_toolchains//configs/ubuntu16_04_clang/1.0:jdk8 \ --host_java_toolchain=@bazel_tools//tools/jdk:toolchain_hostjdk8 \ --java_toolchain=@bazel_tools//tools/jdk:toolchain_hostjdk8 \ - --extra_toolchains=//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.0:toolchain-linux-x86_64 \ - --extra_execution_platforms=@org_tensorflow//third_party/toolchains:rbe_cuda10.0-cudnn7-ubuntu16.04-manylinux2010 \ - --host_platform=@org_tensorflow//third_party/toolchains:rbe_cuda10.0-cudnn7-ubuntu16.04-manylinux2010 \ + --extra_toolchains="${TF_CUDA_CONFIG_REPO}//crosstool:toolchain-linux-x86_64" \ + --extra_execution_platforms="${TF_RBE_PLATFORM}" \ + --host_platform="${TF_RBE_PLATFORM}" \ --local_test_jobs=4 \ --remote_timeout=3600 \ - --platforms=@org_tensorflow//third_party/toolchains:rbe_cuda10.0-cudnn7-ubuntu16.04-manylinux2010 \ + --platforms="${TF_RBE_PLATFORM}" \ -- \ ${DEFAULT_BAZEL_TARGETS} -//tensorflow/lite/... 
@@ -113,3 +122,4 @@ install_bazelisk which bazel run_build + diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index c7160a9ffbd..95a9afa9d5a 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -41,6 +41,7 @@ load("//third_party/psimd:workspace.bzl", psimd = "repo") load("//third_party/pthreadpool:workspace.bzl", pthreadpool = "repo") load("//third_party/sobol_data:workspace.bzl", sobol_data = "repo") load("//third_party/vulkan_headers:workspace.bzl", vulkan_headers = "repo") +load("//third_party/toolchains/remote_config:configs.bzl", "initialize_rbe_configs") def initialize_third_party(): """ Load third party repositories. See above load() statements. """ @@ -81,6 +82,9 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""): def tf_repositories(path_prefix = "", tf_repo_name = ""): """All external dependencies for TF builds.""" + # Loads all external repos to configure RBE builds. + initialize_rbe_configs() + # Note that we check the minimum bazel version in WORKSPACE. clang6_configure(name = "local_config_clang6") cc_download_clang_toolchain(name = "local_config_download_clang") diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index c28cbbac2ea..bdaaa4ab250 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -1174,6 +1174,24 @@ def _create_remote_cuda_repository(repository_ctx, remote_config_repo): {}, ) + repository_ctx.template( + "crosstool/BUILD", + config_repo_label(remote_config_repo, "crosstool:BUILD"), + {}, + ) + + repository_ctx.template( + "crosstool/cc_toolchain_config.bzl", + config_repo_label(remote_config_repo, "crosstool:cc_toolchain_config.bzl"), + {}, + ) + + repository_ctx.template( + "crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc", + config_repo_label(remote_config_repo, "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc"), + {}, + ) + def _cuda_autoconf_impl(repository_ctx): """Implementation of the cuda_autoconf repository rule.""" if not enable_cuda(repository_ctx): @@ -1191,29 +1209,38 @@ def _cuda_autoconf_impl(repository_ctx): else: _create_local_cuda_repository(repository_ctx) -cuda_configure = repository_rule( - implementation = _cuda_autoconf_impl, - environ = [ - _GCC_HOST_COMPILER_PATH, - _GCC_HOST_COMPILER_PREFIX, - _CLANG_CUDA_COMPILER_PATH, - "TF_NEED_CUDA", - "TF_CUDA_CLANG", - _TF_DOWNLOAD_CLANG, - _CUDA_TOOLKIT_PATH, - _CUDNN_INSTALL_PATH, - _TF_CUDA_VERSION, - _TF_CUDNN_VERSION, - _TF_CUDA_COMPUTE_CAPABILITIES, - _TF_CUDA_CONFIG_REPO, - "NVVMIR_LIBRARY_DIR", - _PYTHON_BIN_PATH, - "TMP", - "TMPDIR", - "TF_CUDA_PATHS", - ], +_ENVIRONS = [ + _GCC_HOST_COMPILER_PATH, + _GCC_HOST_COMPILER_PREFIX, + _CLANG_CUDA_COMPILER_PATH, + "TF_NEED_CUDA", + "TF_CUDA_CLANG", + _TF_DOWNLOAD_CLANG, + _CUDA_TOOLKIT_PATH, + _CUDNN_INSTALL_PATH, + _TF_CUDA_VERSION, + _TF_CUDNN_VERSION, + _TF_CUDA_COMPUTE_CAPABILITIES, + "NVVMIR_LIBRARY_DIR", + _PYTHON_BIN_PATH, + "TMP", + "TMPDIR", + "TF_CUDA_PATHS", +] + +remote_cuda_configure = repository_rule( + implementation = _create_local_cuda_repository, + environ = _ENVIRONS, + remotable = True, + attrs = { + "environ": attr.string_dict(), + }, ) +cuda_configure = repository_rule( + implementation = _cuda_autoconf_impl, + environ = _ENVIRONS + [_TF_CUDA_CONFIG_REPO], +) """Detects and configures the local CUDA toolchain. 
Add the following to your WORKSPACE FILE: diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl index e26e9b485b1..20ff2a4aafa 100644 --- a/third_party/gpus/rocm_configure.bzl +++ b/third_party/gpus/rocm_configure.bzl @@ -811,6 +811,21 @@ def _create_remote_rocm_repository(repository_ctx, remote_config_repo): config_repo_label(remote_config_repo, "rocm:rocm/rocm_config.h"), {}, ) + repository_ctx.template( + "crosstool/BUILD", + config_repo_label(remote_config_repo, "crosstool:BUILD"), + {}, + ) + repository_ctx.template( + "crosstool/cc_toolchain_config.bzl", + config_repo_label(remote_config_repo, "crosstool:cc_toolchain_config.bzl"), + {}, + ) + repository_ctx.template( + "crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc", + config_repo_label(remote_config_repo, "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc"), + {}, + ) def _rocm_autoconf_impl(repository_ctx): """Implementation of the rocm_autoconf repository rule.""" @@ -824,20 +839,29 @@ def _rocm_autoconf_impl(repository_ctx): else: _create_local_rocm_repository(repository_ctx) -rocm_configure = repository_rule( - implementation = _rocm_autoconf_impl, - environ = [ - _GCC_HOST_COMPILER_PATH, - _GCC_HOST_COMPILER_PREFIX, - "TF_NEED_ROCM", - _ROCM_TOOLKIT_PATH, - _TF_ROCM_VERSION, - _TF_MIOPEN_VERSION, - _TF_ROCM_AMDGPU_TARGETS, - _TF_ROCM_CONFIG_REPO, - ], +_ENVIRONS = [ + _GCC_HOST_COMPILER_PATH, + _GCC_HOST_COMPILER_PREFIX, + "TF_NEED_ROCM", + _ROCM_TOOLKIT_PATH, + _TF_ROCM_VERSION, + _TF_MIOPEN_VERSION, + _TF_ROCM_AMDGPU_TARGETS, +] + +remote_rocm_configure = repository_rule( + implementation = _create_local_rocm_repository, + environ = _ENVIRONS, + remotable = True, + attrs = { + "environ": attr.string_dict(), + }, ) +rocm_configure = repository_rule( + implementation = _rocm_autoconf_impl, + environ = _ENVIRONS + [_TF_ROCM_CONFIG_REPO], +) """Detects and configures the local ROCm toolchain. Add the following to your WORKSPACE FILE: diff --git a/third_party/nccl/nccl_configure.bzl b/third_party/nccl/nccl_configure.bzl index f05ef7e7a6e..92acb204097 100644 --- a/third_party/nccl/nccl_configure.bzl +++ b/third_party/nccl/nccl_configure.bzl @@ -139,17 +139,28 @@ def _nccl_autoconf_impl(repository_ctx): else: _create_local_nccl_repository(repository_ctx) +_ENVIRONS = [ + _CUDA_TOOLKIT_PATH, + _NCCL_HDR_PATH, + _NCCL_INSTALL_PATH, + _TF_NCCL_VERSION, + _TF_CUDA_COMPUTE_CAPABILITIES, + _TF_NEED_CUDA, + "TF_CUDA_PATHS", +] + +remote_nccl_configure = repository_rule( + implementation = _create_local_nccl_repository, + environ = _ENVIRONS, + remotable = True, + attrs = { + "environ": attr.string_dict(), + }, +) + nccl_configure = repository_rule( implementation = _nccl_autoconf_impl, - environ = [ - _CUDA_TOOLKIT_PATH, - _NCCL_HDR_PATH, - _NCCL_INSTALL_PATH, - _TF_NCCL_VERSION, - _TF_CUDA_COMPUTE_CAPABILITIES, - _TF_NEED_CUDA, - "TF_CUDA_PATHS", - ], + environ = _ENVIRONS, ) """Detects and configures the NCCL configuration. 
diff --git a/third_party/py/python_configure.bzl b/third_party/py/python_configure.bzl index a82839c556c..6e9a22f8063 100644 --- a/third_party/py/python_configure.bzl +++ b/third_party/py/python_configure.bzl @@ -262,14 +262,24 @@ def _python_autoconf_impl(repository_ctx): else: _create_local_python_repository(repository_ctx) +_ENVIRONS = [ + BAZEL_SH, + PYTHON_BIN_PATH, + PYTHON_LIB_PATH, +] + +remote_python_configure = repository_rule( + implementation = _create_local_python_repository, + environ = _ENVIRONS, + remotable = True, + attrs = { + "environ": attr.string_dict(), + }, +) + python_configure = repository_rule( implementation = _python_autoconf_impl, - environ = [ - BAZEL_SH, - PYTHON_BIN_PATH, - PYTHON_LIB_PATH, - TF_PYTHON_CONFIG_REPO, - ], + environ = _ENVIRONS + [TF_PYTHON_CONFIG_REPO], ) """Detects and configures the local Python. diff --git a/third_party/remote_config/BUILD.tpl b/third_party/remote_config/BUILD.tpl new file mode 100644 index 00000000000..76f360f3e72 --- /dev/null +++ b/third_party/remote_config/BUILD.tpl @@ -0,0 +1,11 @@ +platform( + name = "platform", + constraint_values = [ + "@bazel_tools//platforms:x86_64", + "@bazel_tools//platforms:linux", + ], + exec_properties = { + "container-image": "%{container_image}", + "Pool": "default", + }, +) diff --git a/third_party/remote_config/remote_platform_configure.bzl b/third_party/remote_config/remote_platform_configure.bzl new file mode 100644 index 00000000000..175649da643 --- /dev/null +++ b/third_party/remote_config/remote_platform_configure.bzl @@ -0,0 +1,17 @@ +"""Repository rule to create a platform for a docker image to be used with RBE.""" + +def _remote_platform_configure_impl(repository_ctx): + repository_ctx.template( + "BUILD", + Label("@org_tensorflow//third_party/remote_config:BUILD.tpl"), + { + "%{container_image}": repository_ctx.attr.container_image, + }, + ) + +remote_platform_configure = repository_rule( + implementation = _remote_platform_configure_impl, + attrs = { + "container_image": attr.string(mandatory = True), + }, +) diff --git a/third_party/tensorrt/tensorrt_configure.bzl b/third_party/tensorrt/tensorrt_configure.bzl index f08ded2fee4..6bd71049248 100644 --- a/third_party/tensorrt/tensorrt_configure.bzl +++ b/third_party/tensorrt/tensorrt_configure.bzl @@ -178,15 +178,25 @@ def _tensorrt_configure_impl(repository_ctx): _create_local_tensorrt_repository(repository_ctx) +_ENVIRONS = [ + _TENSORRT_INSTALL_PATH, + _TF_TENSORRT_VERSION, + _TF_NEED_TENSORRT, + "TF_CUDA_PATHS", +] + +remote_tensorrt_configure = repository_rule( + implementation = _create_local_tensorrt_repository, + environ = _ENVIRONS, + remotable = True, + attrs = { + "environ": attr.string_dict(), + }, +) + tensorrt_configure = repository_rule( implementation = _tensorrt_configure_impl, - environ = [ - _TENSORRT_INSTALL_PATH, - _TF_TENSORRT_VERSION, - _TF_TENSORRT_CONFIG_REPO, - _TF_NEED_TENSORRT, - "TF_CUDA_PATHS", - ], + environ = _ENVIRONS + [_TF_TENSORRT_CONFIG_REPO], ) """Detects and configures the local CUDA toolchain. 
diff --git a/third_party/toolchains/remote_config/BUILD b/third_party/toolchains/remote_config/BUILD new file mode 100644 index 00000000000..e69de29bb2d diff --git a/third_party/toolchains/remote_config/configs.bzl b/third_party/toolchains/remote_config/configs.bzl new file mode 100644 index 00000000000..2c2bcfb59b3 --- /dev/null +++ b/third_party/toolchains/remote_config/configs.bzl @@ -0,0 +1,24 @@ +"""Configurations of RBE builds used with remote config.""" + +load("//third_party/toolchains/remote_config:rbe_config.bzl", "tensorflow_rbe_config") + +def initialize_rbe_configs(): + tensorflow_rbe_config( + name = "ubuntu16.04-py3-gcc7_manylinux2010-cuda10.0-cudnn7-tensorrt5.1", + compiler = "/dt7/usr/bin/gcc", + compiler_prefix = "/usr/bin", + cuda_version = "10.0", + cudnn_version = "7", + os = "ubuntu16.04-manylinux2010", + python_version = "3", + tensorrt_install_path = "/usr", + tensorrt_version = "5.1", + ) + + tensorflow_rbe_config( + name = "ubuntu16.04-py3_opt-gcc5-rocm", + compiler = "gcc", + os = "ubuntu16.04", + python_version = "3", + rocm_version = "2.5", # Any version will do. + ) diff --git a/third_party/toolchains/remote_config/containers.bzl b/third_party/toolchains/remote_config/containers.bzl new file mode 100644 index 00000000000..8813da19e00 --- /dev/null +++ b/third_party/toolchains/remote_config/containers.bzl @@ -0,0 +1,20 @@ +"""Docker images used with remote config and RBE.""" + +load("//third_party/toolchains/preconfig/generate:containers.bzl", "container_digests") + +containers = { + + # Built with //tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu16.04-manylinux2010. + "cuda10.0-cudnn7-ubuntu16.04-manylinux2010": { + "registry": "gcr.io", + "repository": "tensorflow-testing/nosla-cuda10.0-cudnn7-ubuntu16.04-manylinux2010", + "digest": container_digests["cuda10.0-cudnn7-ubuntu16.04-manylinux2010"], + }, + + # Built with //tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu16.04 + "rocm-ubuntu16.04": { + "registry": "gcr.io", + "repository": "tensorflow-testing/nosla-rocm-ubuntu16.04", + "digest": container_digests["rocm-ubuntu16.04"], + }, +} diff --git a/third_party/toolchains/remote_config/rbe_config.bzl b/third_party/toolchains/remote_config/rbe_config.bzl new file mode 100644 index 00000000000..ca186f094a7 --- /dev/null +++ b/third_party/toolchains/remote_config/rbe_config.bzl @@ -0,0 +1,125 @@ +"""Macro that creates external repositories for remote config.""" + +load("//third_party/py:python_configure.bzl", "remote_python_configure") +load("//third_party/gpus:cuda_configure.bzl", "remote_cuda_configure") +load("//third_party/nccl:nccl_configure.bzl", "remote_nccl_configure") +load("//third_party/gpus:rocm_configure.bzl", "remote_rocm_configure") +load("//third_party/tensorrt:tensorrt_configure.bzl", "remote_tensorrt_configure") +load("//third_party/toolchains/remote_config:containers.bzl", "containers") +load("//third_party/remote_config:remote_platform_configure.bzl", "remote_platform_configure") + +def _container_image_uri(container_name): + container = containers[container_name] + return "docker://%s/%s@%s" % (container["registry"], container["repository"], container["digest"]) + +def _tensorflow_rbe_config(name, compiler, python_version, os, rocm_version = None, cuda_version = None, cudnn_version = None, tensorrt_version = None, tensorrt_install_path = None, cudnn_install_path = None, compiler_prefix = None, sysroot = None): + if cuda_version == None and rocm_version == None: + fail("Neither cuda_version nor rocm_version specified. 
You need to specify exactly one.") + + if cuda_version != None and rocm_version != None: + fail("Specifying both cuda_version and rocm_version is not supported.") + + env = { + "ABI_VERSION": "gcc", + "ABI_LIBC_VERSION": "glibc_2.19", + "BAZEL_COMPILER": compiler, + "BAZEL_HOST_SYSTEM": "i686-unknown-linux-gnu", + "BAZEL_TARGET_LIBC": "glibc_2.19", + "BAZEL_TARGET_CPU": "k8", + "BAZEL_TARGET_SYSTEM": "x86_64-unknown-linux-gnu", + "CC_TOOLCHAIN_NAME": "linux_gnu_x86", + "CC": compiler, + "PYTHON_BIN_PATH": "/usr/bin/python%s" % python_version, + "CLEAR_CACHE": "1", + "HOST_CXX_COMPILER": compiler, + "HOST_C_COMPILER": compiler, + } + + if cuda_version != None: + # The cuda toolchain currently contains its own C++ toolchain definition, + # so we do not fetch local_config_cc. + env.update({ + "TF_NEED_CUDA": "1", + "TF_CUDA_CLANG": "1" if compiler.endswith("clang") else "0", + "TF_CUDA_COMPUTE_CAPABILITIES": "3.0,6.0", + "TF_ENABLE_XLA": "1", + "TF_CUDNN_VERSION": cudnn_version, + "TF_CUDA_VERSION": cuda_version, + "CUDNN_INSTALL_PATH": cudnn_install_path if cudnn_install_path != None else "/usr/lib/x86_64-linux-gnu", + "TF_NEED_TENSORRT": "1", + "TF_TENSORRT_VERSION": tensorrt_version, + "TENSORRT_INSTALL_PATH": tensorrt_install_path if tensorrt_install_path != None else "/usr/lib/x86_64-linux-gnu", + "GCC_HOST_COMPILER_PATH": compiler if not compiler.endswith("clang") else "", + "GCC_HOST_COMPILER_PREFIX": compiler_prefix if compiler_prefix != None else "/usr/bin", + "CLANG_CUDA_COMPILER_PATH": compiler if compiler.endswith("clang") else "", + "TF_SYSROOT": sysroot if sysroot else "", + }) + + container_name = "cuda%s-cudnn%s-%s" % (cuda_version, cudnn_version, os) + container_image = _container_image_uri(container_name) + exec_properties = { + "container-image": container_image, + "Pool": "default", + } + + remote_platform_configure( + name = "%s_config_platform" % name, + container_image = container_image, + ) + + remote_python_configure( + name = "%s_config_python" % name, + environ = env, + exec_properties = exec_properties, + ) + + remote_cuda_configure( + name = "%s_config_cuda" % name, + environ = env, + exec_properties = exec_properties, + ) + + remote_nccl_configure( + name = "%s_config_nccl" % name, + environ = env, + exec_properties = exec_properties, + ) + + remote_tensorrt_configure( + name = "%s_config_tensorrt" % name, + environ = env, + exec_properties = exec_properties, + ) + elif rocm_version != None: + # The rocm toolchain currently contains its own C++ toolchain definition, + # so we do not fetch local_config_cc. + env.update({ + "TF_NEED_ROCM": "1", + "TF_ENABLE_XLA": "0", + }) + + container_name = "rocm-%s" % (os) + container_image = _container_image_uri(container_name) + exec_properties = { + "container-image": container_image, + "Pool": "default", + } + + remote_platform_configure( + name = "%s_config_platform" % name, + container_image = container_image, + ) + + remote_python_configure( + name = "%s_config_python" % name, + environ = env, + exec_properties = exec_properties, + ) + + remote_rocm_configure( + name = "%s_config_rocm" % name, + environ = env, + exec_properties = exec_properties, + ) + +tensorflow_rbe_config = _tensorflow_rbe_config From 3e8aabf2db7dad080d1016c2f6249bed23121ccb Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Wed, 19 Feb 2020 15:05:29 -0800 Subject: [PATCH 278/442] Remove obsolete code. Replace the last places using it with the new version. 
PiperOrigin-RevId: 296066537 Change-Id: I8f6748d9d2ea497822f4f65fa4301d2dd67db89c --- .../python/autograph/converters/lists.py | 23 ++- .../pyct/static_analysis/activity.py | 122 ++++++++------- .../python/autograph/pyct/transformer.py | 139 ++++-------------- .../python/autograph/pyct/transformer_test.py | 137 ----------------- 4 files changed, 109 insertions(+), 312 deletions(-) diff --git a/tensorflow/python/autograph/converters/lists.py b/tensorflow/python/autograph/converters/lists.py index 81808017538..253156ceac1 100644 --- a/tensorflow/python/autograph/converters/lists.py +++ b/tensorflow/python/autograph/converters/lists.py @@ -40,8 +40,10 @@ from tensorflow.python.autograph.pyct import templates from tensorflow.python.autograph.pyct.static_analysis.annos import NodeAnno -# Tags for local state. -POP_USES = 'pop_uses' +class _Statement(object): + + def __init__(self): + self.pop_uses = None class ListTransformer(converter.Base): @@ -97,9 +99,10 @@ class ListTransformer(converter.Base): target_name = 'list_' pop_var_name = self.ctx.namer.new_symbol(target_name, scope.referenced) - pop_uses = self.get_local(POP_USES, []) - pop_uses.append((node, pop_var_name)) - self.set_local(POP_USES, pop_uses) + stmt = self.state[_Statement] + if stmt.pop_uses is None: + stmt.pop_uses = [] + stmt.pop_uses.append((node, pop_var_name)) return templates.replace_as_expression('var_name', var_name=pop_var_name) @@ -184,7 +187,7 @@ class ListTransformer(converter.Base): def _postprocess_statement(self, node): """Inserts any separate pop() calls that node may use.""" - pop_uses = self.get_local(POP_USES, None) + pop_uses = self.state[_Statement].pop_uses if pop_uses: replacements = [] for original_call_node, pop_var_name in pop_uses: @@ -192,17 +195,13 @@ class ListTransformer(converter.Base): self._generate_pop_operation(original_call_node, pop_var_name)) replacements.append(node) node = replacements - self.exit_local_scope() + self.state[_Statement].exit() return node, None - # TODO(mdan): Should we have a generic visit_block instead? - # Right now it feels that a visit_block would add too much magic that's - # hard to follow. - def _visit_and_process_block(self, block): return self.visit_block( block, - before_visit=self.enter_local_scope, + before_visit=self.state[_Statement].enter, after_visit=self._postprocess_statement) def visit_FunctionDef(self, node): diff --git a/tensorflow/python/autograph/pyct/static_analysis/activity.py b/tensorflow/python/autograph/pyct/static_analysis/activity.py index 274fb40fbec..73131d6c0fa 100644 --- a/tensorflow/python/autograph/pyct/static_analysis/activity.py +++ b/tensorflow/python/autograph/pyct/static_analysis/activity.py @@ -205,6 +205,12 @@ class _Comprehension(object): self.targets = set() +class _FunctionOrClass(object): + + def __init__(self): + self.node = None + + class ActivityAnalyzer(transformer.Base): """Annotates nodes with local scope information. 
@@ -225,10 +231,13 @@ class ActivityAnalyzer(transformer.Base): @property def _in_constructor(self): - if len(self.enclosing_entities) > 1: - innermost = self.enclosing_entities[-1] - parent = self.enclosing_entities[-2] - return isinstance(parent, gast.ClassDef) and innermost.name == '__init__' + context = self.state[_FunctionOrClass] + if context.level > 2: + innermost = context.stack[-1].node + parent = context.stack[-2].node + return (isinstance(parent, gast.ClassDef) and + (isinstance(innermost, gast.FunctionDef) and + innermost.name == '__init__')) return False def _node_sets_self_attribute(self, node): @@ -276,7 +285,7 @@ class ActivityAnalyzer(transformer.Base): elif isinstance(node.ctx, gast.Param): self.scope.bound.add(qn) - self.scope.mark_param(qn, self.enclosing_entities[-1]) + self.scope.mark_param(qn, self.state[_FunctionOrClass].node) elif isinstance(node.ctx, gast.Del): # The read matches the Python semantics - attempting to delete an @@ -414,19 +423,18 @@ class ActivityAnalyzer(transformer.Base): node, is_list_comp=False, is_dict_comp=False): - self.state[_Comprehension].enter() - self.state[_Comprehension].is_list_comp = is_list_comp - # Note: it's important to visit the generators first to properly account - # for the variables local to these generators. Example: `x` is local to the - # expression `z for x in y for z in x`. - node.generators = self.visit_block(node.generators) - if is_dict_comp: - node.key = self.visit(node.key) - node.value = self.visit(node.value) - else: - node.elt = self.visit(node.elt) - self.state[_Comprehension].exit() - return node + with self.state[_Comprehension] as comprehension_: + comprehension_.is_list_comp = is_list_comp + # Note: it's important to visit the generators first to properly account + # for the variables local to these generators. Example: `x` is local to + # the expression `z for x in y for z in x`. + node.generators = self.visit_block(node.generators) + if is_dict_comp: + node.key = self.visit(node.key) + node.value = self.visit(node.value) + else: + node.elt = self.visit(node.elt) + return node def visit_comprehension(self, node): # It is important to visit children in this order so that the reads to @@ -451,51 +459,57 @@ class ActivityAnalyzer(transformer.Base): return self._process_statement(node) def visit_ClassDef(self, node): - # The ClassDef node itself has a Scope object that tracks the creation - # of its name, along with the usage of any decorator accompanying it. - self._enter_scope(False) - node.decorator_list = self.visit_block(node.decorator_list) - self.scope.modified.add(qual_names.QN(node.name)) - self.scope.bound.add(qual_names.QN(node.name)) - node.bases = self.visit_block(node.bases) - node.keywords = self.visit_block(node.keywords) - self._exit_and_record_scope(node) + with self.state[_FunctionOrClass] as fn: + fn.node = node + # The ClassDef node itself has a Scope object that tracks the creation + # of its name, along with the usage of any decorator accompanying it. + self._enter_scope(False) + node.decorator_list = self.visit_block(node.decorator_list) + self.scope.modified.add(qual_names.QN(node.name)) + self.scope.bound.add(qual_names.QN(node.name)) + node.bases = self.visit_block(node.bases) + node.keywords = self.visit_block(node.keywords) + self._exit_and_record_scope(node) - # A separate Scope tracks the actual class definition. - self._enter_scope(True) - node = self.generic_visit(node) - self._exit_scope() - return node + # A separate Scope tracks the actual class definition. 
+ self._enter_scope(True) + node = self.generic_visit(node) + self._exit_scope() + return node def visit_FunctionDef(self, node): - # The FunctionDef node itself has a Scope object that tracks the creation - # of its name, along with the usage of any decorator accompanying it. - self._enter_scope(False) - node.decorator_list = self.visit_block(node.decorator_list) - function_name = qual_names.QN(node.name) - self.scope.modified.add(function_name) - self.scope.bound.add(function_name) - self._exit_and_record_scope(node) + with self.state[_FunctionOrClass] as fn: + fn.node = node + # The FunctionDef node itself has a Scope object that tracks the creation + # of its name, along with the usage of any decorator accompanying it. + self._enter_scope(False) + node.decorator_list = self.visit_block(node.decorator_list) + function_name = qual_names.QN(node.name) + self.scope.modified.add(function_name) + self.scope.bound.add(function_name) + self._exit_and_record_scope(node) - # A separate Scope tracks the actual function definition. - self._enter_scope(True) - node.args = self.visit(node.args) + # A separate Scope tracks the actual function definition. + self._enter_scope(True) + node.args = self.visit(node.args) - # Track the body separately. This is for compatibility reasons, it may not - # be strictly needed. - self._enter_scope(False) - node.body = self.visit_block(node.body) - self._exit_and_record_scope(node, NodeAnno.BODY_SCOPE) + # Track the body separately. This is for compatibility reasons, it may not + # be strictly needed. + self._enter_scope(False) + node.body = self.visit_block(node.body) + self._exit_and_record_scope(node, NodeAnno.BODY_SCOPE) - self._exit_scope() - return node + self._exit_scope() + return node def visit_Lambda(self, node): # Lambda nodes are treated in roughly the same way as FunctionDef nodes. - self._enter_scope(True) - node = self.generic_visit(node) - self._exit_and_record_scope(node) - return node + with self.state[_FunctionOrClass] as fn: + fn.node = node + self._enter_scope(True) + node = self.generic_visit(node) + self._exit_and_record_scope(node) + return node def visit_With(self, node): self._enter_scope(False) diff --git a/tensorflow/python/autograph/pyct/transformer.py b/tensorflow/python/autograph/pyct/transformer.py index d8b8b6e7168..28cd9427bd1 100644 --- a/tensorflow/python/autograph/pyct/transformer.py +++ b/tensorflow/python/autograph/pyct/transformer.py @@ -244,76 +244,14 @@ class Base(gast.NodeTransformer): self._lineno = 0 self._col_offset = 0 self.ctx = ctx - self._enclosing_entities = [] - - # A stack that allows keeping mutable, scope-local state where scopes may be - # nested. For example, it can be used to track the usage of break - # statements in each loop, where loops may be nested. - self._local_scope_state = [] - self.enter_local_scope() # Allows scoping of local variables to keep state across calls to visit_* - # methods. Multiple scope hierarchies may exist and are keyed by tag. A scope - # is valid at one or more nodes and all its children. Scopes created in - # child nodes supersede their parent. Scopes are isolated from one another. + # methods. Multiple scope hierarchies may exist and are keyed by tag. A + # scope is valid at one or more nodes and all its children. Scopes created + # in child nodes supersede their parent. Scopes are isolated from one + # another. 
self.state = _State() - @property - def enclosing_entities(self): - return tuple(self._enclosing_entities) - - @property - def local_scope_level(self): - return len(self._local_scope_state) - - def enter_local_scope(self, inherit=None): - """Deprecated. - - Use self.state instead. - - Marks entry into a new local scope. - - Args: - inherit: Optional enumerable of variable names to copy from the parent - scope. - """ - scope_entered = {} - if inherit: - this_scope = self._local_scope_state[-1] - for name in inherit: - if name in this_scope: - scope_entered[name] = this_scope[name] - self._local_scope_state.append(scope_entered) - - def exit_local_scope(self, keep=None): - """Deprecated. - - Use self.state instead. - - Marks exit from the current local scope. - - Args: - keep: Optional enumerable of variable names to copy into the parent scope. - - Returns: - A dict containing the scope that has just been exited. - """ - scope_left = self._local_scope_state.pop() - if keep: - this_scope = self._local_scope_state[-1] - for name in keep: - if name in scope_left: - this_scope[name] = scope_left[name] - return scope_left - - def set_local(self, name, value): - """Deprecated. Use self.state instead.""" - self._local_scope_state[-1][name] = value - - def get_local(self, name, default=None): - """Deprecated. Use self.state instead.""" - return self._local_scope_state[-1].get(name, default) - def debug_print(self, node): """Helper method useful for debugging. Prints the AST.""" if __debug__: @@ -479,33 +417,24 @@ class Base(gast.NodeTransformer): type(node)) raise ValueError(msg) - did_enter_function = False - local_scope_size_at_entry = len(self._local_scope_state) - processing_expr_node = False + if anno.hasanno(node, anno.Basic.SKIP_PROCESSING): + return node parent_origin = self.ctx.current_origin - if isinstance(node, (gast.FunctionDef, gast.ClassDef, gast.Lambda)): - did_enter_function = True - elif isinstance(node, gast.Expr): - processing_expr_node = True - - if did_enter_function: - self._enclosing_entities.append(node) - if anno.hasanno(node, anno.Basic.ORIGIN): self.ctx.current_origin = anno.getanno(node, anno.Basic.ORIGIN) - if processing_expr_node: - entry_expr_value = node.value + try: + processing_expr_node = isinstance(node, gast.Expr) + if processing_expr_node: + entry_expr_value = node.value - if not anno.hasanno(node, anno.Basic.SKIP_PROCESSING): result = super(Base, self).visit(node) - self.ctx.current_origin = parent_origin - # Adjust for consistency: replacing the value of an Expr with - # an Assign node removes the need for the Expr node. - if processing_expr_node: - if isinstance(result, gast.Expr) and result.value != entry_expr_value: + # Adjust for consistency: replacing the value of an Expr with + # an Assign node removes the need for the Expr node. + if (processing_expr_node and isinstance(result, gast.Expr) and + (result.value is not entry_expr_value)): # When the replacement is a list, it is assumed that the list came # from a template that contained a number of statements, which # themselves are standalone and don't require an enclosing Expr. @@ -513,29 +442,21 @@ class Base(gast.NodeTransformer): (list, tuple, gast.Assign, gast.AugAssign)): result = result.value - # By default, all replacements receive the origin info of the replaced node. 
- if result is not node and result is not None: - nodes_to_adjust = result - if isinstance(result, (list, tuple)): - nodes_to_adjust = result - else: - nodes_to_adjust = (result,) - for n in nodes_to_adjust: - if not anno.hasanno(n, anno.Basic.ORIGIN): - inherited_origin = anno.getanno( - node, anno.Basic.ORIGIN, default=parent_origin) - if inherited_origin is not None: - anno.setanno(n, anno.Basic.ORIGIN, inherited_origin) + # By default, all replacements receive the origin info of the replaced + # node. + if result is not node and result is not None: + inherited_origin = anno.getanno( + node, anno.Basic.ORIGIN, default=parent_origin) + if inherited_origin is not None: + nodes_to_adjust = result + if isinstance(result, (list, tuple)): + nodes_to_adjust = result + else: + nodes_to_adjust = (result,) + for n in nodes_to_adjust: + if not anno.hasanno(n, anno.Basic.ORIGIN): + anno.setanno(n, anno.Basic.ORIGIN, inherited_origin) + finally: + self.ctx.current_origin = parent_origin - # On exception, the local scope integrity is not guaranteed. - if did_enter_function: - self._enclosing_entities.pop() - - if local_scope_size_at_entry != len(self._local_scope_state): - raise AssertionError( - 'Inconsistent local scope stack. Before entering node %s, the' - ' stack had length %d, after exit it has length %d. This' - ' indicates enter_local_scope and exit_local_scope are not' - ' well paired.' % (node, local_scope_size_at_entry, - len(self._local_scope_state))) return result diff --git a/tensorflow/python/autograph/pyct/transformer_test.py b/tensorflow/python/autograph/pyct/transformer_test.py index 928f9be4223..05bae8e8f31 100644 --- a/tensorflow/python/autograph/pyct/transformer_test.py +++ b/tensorflow/python/autograph/pyct/transformer_test.py @@ -34,62 +34,6 @@ class TransformerTest(test.TestCase): source_code=None, source_file=None, future_features=(), namespace=None) return transformer.Context(entity_info) - def test_entity_scope_tracking(self): - - class TestTransformer(transformer.Base): - - # The choice of note to assign to is arbitrary. Using Assign because it's - # easy to find in the tree. - def visit_Assign(self, node): - anno.setanno(node, 'enclosing_entities', self.enclosing_entities) - return self.generic_visit(node) - - # This will show up in the lambda function. 
- def visit_BinOp(self, node): - anno.setanno(node, 'enclosing_entities', self.enclosing_entities) - return self.generic_visit(node) - - tr = TestTransformer(self._simple_context()) - - def test_function(): - a = 0 - - class TestClass(object): - - def test_method(self): - b = 0 - def inner_function(x): - c = 0 - d = lambda y: (x + y) - return c, d - return b, inner_function - return a, TestClass - - node, _ = parser.parse_entity(test_function, future_features=()) - node = tr.visit(node) - - test_function_node = node - test_class = test_function_node.body[1] - test_method = test_class.body[0] - inner_function = test_method.body[1] - lambda_node = inner_function.body[1].value - - a = test_function_node.body[0] - b = test_method.body[0] - c = inner_function.body[0] - lambda_expr = lambda_node.body - - self.assertEqual( - (test_function_node,), anno.getanno(a, 'enclosing_entities')) - self.assertEqual((test_function_node, test_class, test_method), - anno.getanno(b, 'enclosing_entities')) - self.assertEqual( - (test_function_node, test_class, test_method, inner_function), - anno.getanno(c, 'enclosing_entities')) - self.assertEqual((test_function_node, test_class, test_method, - inner_function, lambda_node), - anno.getanno(lambda_expr, 'enclosing_entities')) - def assertSameAnno(self, first, second, key): self.assertIs(anno.getanno(first, key), anno.getanno(second, key)) @@ -203,87 +147,6 @@ class TransformerTest(test.TestCase): inner_if_body = outer_if_body[1].body self.assertDifferentAnno(inner_if_body[0], outer_if_body[0], 'cond_state') - def test_local_scope_info_stack(self): - - class TestTransformer(transformer.Base): - - # Extract all string constants from the block. - def visit_Constant(self, node): - self.set_local( - 'string', self.get_local('string', default='') + str(node.value)) - return self.generic_visit(node) - - def _annotate_result(self, node): - self.enter_local_scope() - node = self.generic_visit(node) - anno.setanno(node, 'test', self.get_local('string')) - self.exit_local_scope() - return node - - def visit_While(self, node): - return self._annotate_result(node) - - def visit_For(self, node): - return self._annotate_result(node) - - tr = TestTransformer(self._simple_context()) - - def test_function(a): - """Docstring.""" - assert a == 'This should not be counted' - for i in range(3): - _ = 'a' - if i > 2: - return 'b' - else: - _ = 'c' - while 4: - raise '1' - return 'nor this' - - node, _ = parser.parse_entity(test_function, future_features=()) - node = tr.visit(node) - - for_node = node.body[2] - while_node = for_node.body[1].orelse[1] - - self.assertFalse(anno.hasanno(for_node, 'string')) - self.assertEqual('3a2bc', anno.getanno(for_node, 'test')) - self.assertFalse(anno.hasanno(while_node, 'string')) - self.assertEqual('41', anno.getanno(while_node, 'test')) - - def test_local_scope_info_stack_checks_integrity(self): - - class TestTransformer(transformer.Base): - - def visit_If(self, node): - self.enter_local_scope() - return self.generic_visit(node) - - def visit_For(self, node): - node = self.generic_visit(node) - self.exit_local_scope() - return node - - tr = TestTransformer(self._simple_context()) - - def no_exit(a): - if a > 0: - print(a) - return None - - node, _ = parser.parse_entity(no_exit, future_features=()) - with self.assertRaises(AssertionError): - tr.visit(node) - - def no_entry(a): - for _ in a: - print(a) - - node, _ = parser.parse_entity(no_entry, future_features=()) - with self.assertRaises(AssertionError): - tr.visit(node) - def 
test_visit_block_postprocessing(self): class TestTransformer(transformer.Base): From ed371aa5d266222c799a7192e438cdd8c00464fe Mon Sep 17 00:00:00 2001 From: Nupur Garg Date: Wed, 19 Feb 2020 15:08:42 -0800 Subject: [PATCH 279/442] Add `shape_signature` to the Java & Python Tensor API. PiperOrigin-RevId: 296067187 Change-Id: I2d98d92967cfe0429a9794685780d5f464b7882d --- tensorflow/lite/java/BUILD | 6 +++- .../main/java/org/tensorflow/lite/Tensor.java | 16 ++++++++++ .../lite/java/src/main/native/tensor_jni.cc | 19 ++++++++++++ .../org/tensorflow/lite/InterpreterTest.java | 28 ++++++++++++++++++ .../java/org/tensorflow/lite/TensorTest.java | 1 + .../src/testdata/add_unknown_dimensions.bin | Bin 0 -> 412 bytes .../interpreter_wrapper.cc | 5 +++- tensorflow/lite/python/lite_test.py | 3 +- 8 files changed, 75 insertions(+), 3 deletions(-) create mode 100644 tensorflow/lite/java/src/testdata/add_unknown_dimensions.bin diff --git a/tensorflow/lite/java/BUILD b/tensorflow/lite/java/BUILD index a9db5ddbe88..cf8e6d40f9f 100644 --- a/tensorflow/lite/java/BUILD +++ b/tensorflow/lite/java/BUILD @@ -11,7 +11,10 @@ package( licenses = ["notice"], # Apache 2.0 ) -exports_files(["src/testdata/add.bin"]) +exports_files([ + "src/testdata/add.bin", + "src/testdata/add_unknown_dimensions.bin", +]) JAVA_SRCS = glob([ "src/main/java/org/tensorflow/lite/*.java", @@ -226,6 +229,7 @@ java_test( ], data = [ "src/testdata/add.bin", + "src/testdata/add_unknown_dimensions.bin", "//tensorflow/lite:testdata/multi_add.bin", "//tensorflow/lite:testdata/multi_add_flex.bin", ], diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java index 8ed019dc3f1..5d15b2c9a7e 100644 --- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java +++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java @@ -84,6 +84,18 @@ public final class Tensor { return shapeCopy; } + /** + * Returns the original shape of the Tensor, + * i.e., the sizes of each dimension - before any resizing was performed. Unknown dimensions are + * designated with a value of -1. + * + * @return an array where the i-th element is the size of the i-th dimension of the tensor. + */ + public int[] shapeSignature() { + return shapeSignatureCopy; + } + /** * Returns the (global) index of the tensor within the owning {@link Interpreter}. 
* @@ -363,11 +375,13 @@ public final class Tensor { private long nativeHandle; private final DataType dtype; private int[] shapeCopy; + private final int[] shapeSignatureCopy; private Tensor(long nativeHandle) { this.nativeHandle = nativeHandle; this.dtype = DataType.fromC(dtype(nativeHandle)); this.shapeCopy = shape(nativeHandle); + this.shapeSignatureCopy = shapeSignature(nativeHandle); } private ByteBuffer buffer() { @@ -386,6 +400,8 @@ public final class Tensor { private static native int[] shape(long handle); + private static native int[] shapeSignature(long handle); + private static native int numBytes(long handle); private static native boolean hasDelegateBufferHandle(long handle); diff --git a/tensorflow/lite/java/src/main/native/tensor_jni.cc b/tensorflow/lite/java/src/main/native/tensor_jni.cc index 8beafa0c48e..9a38e85acd1 100644 --- a/tensorflow/lite/java/src/main/native/tensor_jni.cc +++ b/tensorflow/lite/java/src/main/native/tensor_jni.cc @@ -438,6 +438,25 @@ Java_org_tensorflow_lite_Tensor_shape(JNIEnv* env, jclass clazz, jlong handle) { return result; } +JNIEXPORT jintArray JNICALL Java_org_tensorflow_lite_Tensor_shapeSignature( + JNIEnv* env, jclass clazz, jlong handle) { + TfLiteTensor* tensor = GetTensorFromHandle(env, handle); + if (tensor == nullptr) return nullptr; + + int num_dims = 0; + int const* data = nullptr; + if (tensor->dims_signature != nullptr && tensor->dims_signature->size != 0) { + num_dims = tensor->dims_signature->size; + data = tensor->dims_signature->data; + } else { + num_dims = tensor->dims->size; + data = tensor->dims->data; + } + jintArray result = env->NewIntArray(num_dims); + env->SetIntArrayRegion(result, 0, num_dims, data); + return result; +} + JNIEXPORT jint JNICALL Java_org_tensorflow_lite_Tensor_numBytes(JNIEnv* env, jclass clazz, jlong handle) { diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java index cb1cb919c6d..8b18e1764ce 100644 --- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java +++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java @@ -37,12 +37,16 @@ public final class InterpreterTest { "tensorflow/lite/testdata/multi_add.bin"; private static final String FLEX_MODEL_PATH = "tensorflow/lite/testdata/multi_add_flex.bin"; + private static final String UNKNOWN_DIMS_MODEL_PATH = + "tensorflow/lite/java/src/testdata/add_unknown_dimensions.bin"; private static final ByteBuffer MODEL_BUFFER = TestUtils.getTestFileAsBuffer(MODEL_PATH); private static final ByteBuffer MULTIPLE_INPUTS_MODEL_BUFFER = TestUtils.getTestFileAsBuffer(MULTIPLE_INPUTS_MODEL_PATH); private static final ByteBuffer FLEX_MODEL_BUFFER = TestUtils.getTestFileAsBuffer(FLEX_MODEL_PATH); + private static final ByteBuffer UNKNOWN_DIMS_MODEL_PATH_BUFFER = + TestUtils.getTestFileAsBuffer(UNKNOWN_DIMS_MODEL_PATH); @Test public void testInterpreter() throws Exception { @@ -218,6 +222,30 @@ public final class InterpreterTest { } } + @Test + public void testUnknownDims() { + try (Interpreter interpreter = new Interpreter(UNKNOWN_DIMS_MODEL_PATH_BUFFER)) { + int[] inputDims = {1, 1, 3, 3}; + int[] inputDimsSignature = {1, -1, 3, 3}; + assertThat(interpreter.getInputTensor(0).shape()).isEqualTo(inputDims); + assertThat(interpreter.getInputTensor(0).shapeSignature()).isEqualTo(inputDimsSignature); + + // Set the dimension of the unknown dimension to the expected dimension and ensure shape + // signature doesn't 
change. + inputDims[1] = 3; + interpreter.resizeInput(0, inputDims); + assertThat(interpreter.getInputTensor(0).shape()).isEqualTo(inputDims); + assertThat(interpreter.getInputTensor(0).shapeSignature()).isEqualTo(inputDimsSignature); + + ByteBuffer input = + ByteBuffer.allocateDirect(1 * 3 * 3 * 3 * 4).order(ByteOrder.nativeOrder()); + ByteBuffer output = + ByteBuffer.allocateDirect(1 * 3 * 3 * 3 * 4).order(ByteOrder.nativeOrder()); + interpreter.run(input, output); + assertThat(interpreter.getOutputTensor(0).shape()).isEqualTo(inputDims); + } + } + @Test public void testRunWithWrongInputType() { Interpreter interpreter = new Interpreter(MODEL_BUFFER); diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java index 105ef714b4a..09e9b1cbc8f 100644 --- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java +++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java @@ -73,6 +73,7 @@ public final class TensorTest { assertThat(tensor).isNotNull(); int[] expectedShape = {2, 8, 8, 3}; assertThat(tensor.shape()).isEqualTo(expectedShape); + assertThat(tensor.shapeSignature()).isEqualTo(expectedShape); assertThat(tensor.dataType()).isEqualTo(DataType.FLOAT32); assertThat(tensor.numBytes()).isEqualTo(2 * 8 * 8 * 3 * 4); assertThat(tensor.numElements()).isEqualTo(2 * 8 * 8 * 3); diff --git a/tensorflow/lite/java/src/testdata/add_unknown_dimensions.bin b/tensorflow/lite/java/src/testdata/add_unknown_dimensions.bin new file mode 100644 index 0000000000000000000000000000000000000000..47ac92ffa6551ee39fcb4911b55977fa2c7c338d GIT binary patch literal 412 zcmYL_!D_-l5Qe83sF=|9kb?&gIrQMkix-bVK@b$Ac}F@a%%o>#oB49`(=<(&{S)Lcy2d-=85F0W9LM1u-r$k*YPonVUh2b_ zKYn^Q=FNi*EU}(t_;OCIb4V=U8RYN4FPs2XSc(zlC$KMf2RYT3ocCDeWTgWh87Obc z46#=3|JApCI3yvq-;z&NZI|+CcGbzJy_V)JowIM;d6+6z=r#{MsV_MTn1N=KIrbul bNMT)-TmMm4&L3|%sKdWk&L8`P8`WO`!4oHf literal 0 HcmV?d00001 diff --git a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc index 9993d0211c2..0ca53f98422 100644 --- a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc +++ b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc @@ -345,9 +345,12 @@ PyObject* InterpreterWrapper::TensorSizeSignature(int i) const { const TfLiteTensor* tensor = interpreter_->tensor(i); const int32_t* size_signature_data = nullptr; int32_t size_signature_size = 0; - if (tensor->dims_signature != nullptr) { + if (tensor->dims_signature != nullptr && tensor->dims_signature->size != 0) { size_signature_data = tensor->dims_signature->data; size_signature_size = tensor->dims_signature->size; + } else { + size_signature_data = tensor->dims->data; + size_signature_size = tensor->dims->size; } PyObject* np_array = PyArrayFromIntVector(size_signature_data, size_signature_size); diff --git a/tensorflow/lite/python/lite_test.py b/tensorflow/lite/python/lite_test.py index 7977b30e7ae..a63ce69cb69 100644 --- a/tensorflow/lite/python/lite_test.py +++ b/tensorflow/lite/python/lite_test.py @@ -462,7 +462,8 @@ class FromSessionTest(TestModels, parameterized.TestCase): 3] == input_details[0]['shape_signature']).all()) output_details = interpreter.get_output_details() - self.assertFalse(output_details[0]['shape_signature']) + self.assertTrue(([1, 16, 16, + 3] == output_details[0]['shape_signature']).all()) def testBatchSizeValid(self): with ops.Graph().as_default(): From 
2dd34ee6f1b98a6c13aca1525975a7653448d787 Mon Sep 17 00:00:00 2001 From: Chenkai Kuang Date: Wed, 19 Feb 2020 15:15:09 -0800 Subject: [PATCH 280/442] Enable keras_save_load_test and increase shard_count to mitigate test timeout. PiperOrigin-RevId: 296068534 Change-Id: I9dcf36cf9ebb78c5cd421dc442a7881bc5ddc232 --- tensorflow/python/distribute/BUILD | 2 +- tensorflow/python/distribute/keras_save_load_test.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index e201cfa6dbb..bc6865c8617 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -1275,7 +1275,7 @@ distribute_py_test( srcs = ["keras_save_load_test.py"], full_precision = True, main = "keras_save_load_test.py", - shard_count = 5, + shard_count = 7, tags = [ "multi_and_single_gpu", ], diff --git a/tensorflow/python/distribute/keras_save_load_test.py b/tensorflow/python/distribute/keras_save_load_test.py index bb4c2b843f5..494a348d050 100644 --- a/tensorflow/python/distribute/keras_save_load_test.py +++ b/tensorflow/python/distribute/keras_save_load_test.py @@ -66,7 +66,6 @@ class KerasSaveLoadTest(test_base.TestSavedModelBase): distribution_for_restoring, save_in_scope, experimental_run_tf_function): - self.skipTest('TODO: b/148245425') self.run_test_save_strategy_restore_strategy(model_and_input, distribution_for_saving, distribution_for_restoring, From 31fa9a5c4f2eddd4790ee0ecec8f77ae65bb2781 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Wed, 19 Feb 2020 15:22:16 -0800 Subject: [PATCH 281/442] Make use of GetDataDependencyFilepath and JoinPath to build paths which will work across operating systems. The previous implementation doesn't work correctly on Windows. PiperOrigin-RevId: 296070105 Change-Id: Ie4b5ecae64807682153dc17e6471984284edc111 --- .../compiler/xla/service/gpu/llvm_gpu_backend/BUILD | 1 + .../xla/service/gpu/llvm_gpu_backend/utils_test.cc | 13 ++++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD index f1083553c57..1419a4f792d 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD @@ -69,6 +69,7 @@ tf_cc_test( "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", "//tensorflow/core:test", + "//tensorflow/core/platform:resource_loader", "@llvm-project//llvm:core", "@llvm-project//llvm:support", ], diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils_test.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils_test.cc index 8c7f70ebcfb..84e3520c873 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils_test.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils_test.cc @@ -17,25 +17,28 @@ limitations under the License. 
#include -#include "tensorflow/core/lib/io/path.h" - #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/resource_loader.h" #include "tensorflow/core/platform/test.h" namespace xla { namespace gpu { namespace { -const char kSaxpyIRFile[] = - "compiler/xla/service/gpu/llvm_gpu_backend/tests_data/saxpy.ll"; +string SaxpyIRFile() { + return tensorflow::io::JoinPath("tensorflow", "compiler", "xla", "service", + "gpu", "llvm_gpu_backend", "tests_data", + "saxpy.ll"); +} TEST(UtilsTest, TestLoadIRModule) { llvm::LLVMContext llvm_context; string test_srcdir = tensorflow::testing::TensorFlowSrcRoot(); std::unique_ptr module = LoadIRModule( - tensorflow::io::JoinPath(test_srcdir, kSaxpyIRFile), &llvm_context); + tensorflow::GetDataDependencyFilepath(SaxpyIRFile()), &llvm_context); // Sanity check that the module was loaded properly. ASSERT_NE(nullptr, module); ASSERT_NE(std::string::npos, module->getModuleIdentifier().find("saxpy.ll")); From 120b0f57f04266fd25edc1ef1bdae9200b570360 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 15:31:17 -0800 Subject: [PATCH 282/442] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 296072123 Change-Id: Iae966a89cba24e82bb7c86ca30dda3581a0f98e8 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 449a95765a5..ecdce1e627b 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. 
-// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21345,7 +21345,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22053,7 +22053,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22249,7 +22249,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22318,7 +22318,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22433,7 +22433,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22492,7 +22492,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22666,7 +22666,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22857,7 +22857,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25297,7 +25297,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25629,7 +25629,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25679,7 +25679,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25929,7 +25929,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26559,7 +26559,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27624,7 +27624,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45536,7 +45536,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 0546460a7d8eaf68f697e18ba0fc9c3b96ab059a Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Wed, 19 Feb 2020 15:32:49 -0800 Subject: [PATCH 283/442] [Executor tracing] Add num_output_edges to the PropagateOutputs TraceMe. This enables someone reading a trace to distinguish between long PropagateOutputs events that are due to contention, and long events that are due to a lot of propagation work going on. 
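To make the added field concrete: with this change, a `PropagateOutputs` activity in a profiler trace carries a name of the form shown below (the `id`, `kernel_name`, and `num_output_edges` values here are purely illustrative):

```
ExecutorPropagateOutputs#id=12,kernel_name=gradients/MatMul_grad/MatMul,num_output_edges=2#
```

A long event that reports only a handful of output edges points at lock contention, while one that reports many edges reflects genuine propagation work.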
PiperOrigin-RevId: 296072451 Change-Id: Ic7b731f3ccb2b0ba12e0e7d23f31e89cef9b0a97 --- tensorflow/core/common_runtime/executor.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 8d650c21210..0be1d5df616 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -2161,7 +2161,9 @@ void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node, profiler::TraceMe activity( [&]() { return strings::StrCat("ExecutorPropagateOutputs#", "id=", step_id_, - ",kernel_name=", item->kernel->name_view(), "#"); + ",kernel_name=", item->kernel->name_view(), + ",num_output_edges=", item->num_output_edges, + "#"); }, profiler::GetTFTraceMeLevel(/*is_expensive=*/false)); From 88c4b69a577f3393e5cbfe42054d2ca93f652536 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 15:45:21 -0800 Subject: [PATCH 284/442] Fix GitHub issue templates. PiperOrigin-RevId: 296075106 Change-Id: I858fde7b2432be833c9524b9dff98d95cb55f96c --- .github/ISSUE_TEMPLATE/00-bug-issue.md | 26 ++++++++++++------- .../ISSUE_TEMPLATE/40-tflite-op-request.md | 8 +++++- .../60-tflite-converter-issue.md | 2 ++ .../ISSUE_TEMPLATE/80-performance-issue.md | 26 ++++++++++++------- 4 files changed, 43 insertions(+), 19 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/00-bug-issue.md b/.github/ISSUE_TEMPLATE/00-bug-issue.md index bb4a1a7ea14..0c2bcb27c7d 100644 --- a/.github/ISSUE_TEMPLATE/00-bug-issue.md +++ b/.github/ISSUE_TEMPLATE/00-bug-issue.md @@ -10,13 +10,20 @@ labels: 'type:bug' we only address code/doc bugs, performance issues, feature requests and build/installation issues on GitHub. tag:bug_template -**System information** - Have I written custom code (as opposed to using a stock -example script provided in TensorFlow): - OS Platform and Distribution (e.g., -Linux Ubuntu 16.04): - Mobile device (e.g. iPhone 8, Pixel 2, Samsung Galaxy) if -the issue happens on mobile device: - TensorFlow installed from (source or -binary): - TensorFlow version (use command below): - Python version: - Bazel -version (if compiling from source): - GCC/Compiler version (if compiling from -source): - CUDA/cuDNN version: - GPU model and memory: +**System information** +- Have I written custom code (as opposed to using a stock +example script provided in TensorFlow): +- OS Platform and Distribution (e.g., +Linux Ubuntu 16.04): +- Mobile device (e.g. iPhone 8, Pixel 2, Samsung Galaxy) if +the issue happens on mobile device: +- TensorFlow installed from (source or +binary): - TensorFlow version (use command below): +- Python version: - Bazel +version (if compiling from source): +- GCC/Compiler version (if compiling from +source): +- CUDA/cuDNN version: - GPU model and memory: You can collect some of this information using our environment capture [script](https://github.com/tensorflow/tensorflow/tree/master/tools/tf_env_collect.sh) @@ -28,8 +35,9 @@ tensorflow as tf; print(tf.GIT_VERSION, tf.VERSION)"` 2. TF 2.0: `python -c **Describe the expected behavior** -**Code to reproduce the issue** Provide a reproducible test case that is the -bare minimum necessary to generate the problem. +**Standalone code to reproduce the issue** +Provide a reproducible test case that is the bare minimum necessary to generate +the problem. If possible, please share a link to Colab/Jupyter/any notebook. 
**Other info / logs** Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full diff --git a/.github/ISSUE_TEMPLATE/40-tflite-op-request.md b/.github/ISSUE_TEMPLATE/40-tflite-op-request.md index f4b6733c211..4f1e60b553a 100644 --- a/.github/ISSUE_TEMPLATE/40-tflite-op-request.md +++ b/.github/ISSUE_TEMPLATE/40-tflite-op-request.md @@ -17,8 +17,14 @@ labels: 'comp:lite' # Copy and paste here ``` +**Standalone code to reproduce the issue** +Provide a reproducible test case that is the bare minimum necessary to generate +the problem. If possible, please share a link to Colab/Jupyter/any notebook. + Also, please include a link to a GraphDef or the model if possible. **Any other info / logs** -Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached. +Include any logs or source code that would be helpful to diagnose the problem. +If including tracebacks, please include the full traceback. Large logs and files +should be attached. diff --git a/.github/ISSUE_TEMPLATE/60-tflite-converter-issue.md b/.github/ISSUE_TEMPLATE/60-tflite-converter-issue.md index 3cd6e977d2f..32ebaff1a9c 100644 --- a/.github/ISSUE_TEMPLATE/60-tflite-converter-issue.md +++ b/.github/ISSUE_TEMPLATE/60-tflite-converter-issue.md @@ -1,6 +1,7 @@ --- name: TensorFlow Lite New Converter Issue about: Use this template for reporting issues during model conversion to TFLite +labels: 'TFLiteConverter' --- @@ -12,6 +13,7 @@ about: Use this template for reporting issues during model conversion to TFLite **Command used to run the converter or code if you’re using the Python API** +If possible, please share a link to Colab/Jupyter/any notebook. ``` # Copy and paste here the exact command diff --git a/.github/ISSUE_TEMPLATE/80-performance-issue.md b/.github/ISSUE_TEMPLATE/80-performance-issue.md index 2090801742c..a1cbf23df4b 100644 --- a/.github/ISSUE_TEMPLATE/80-performance-issue.md +++ b/.github/ISSUE_TEMPLATE/80-performance-issue.md @@ -11,13 +11,20 @@ As per our we only address code/doc bugs, performance issues, feature requests and build/installation issues on GitHub. tag:performance_template -**System information** - Have I written custom code (as opposed to using a stock -example script provided in TensorFlow): - OS Platform and Distribution (e.g., -Linux Ubuntu 16.04): - Mobile device (e.g. iPhone 8, Pixel 2, Samsung Galaxy) if -the issue happens on mobile device: - TensorFlow installed from (source or -binary): - TensorFlow version (use command below): - Python version: - Bazel -version (if compiling from source): - GCC/Compiler version (if compiling from -source): - CUDA/cuDNN version: - GPU model and memory: +**System information** +- Have I written custom code (as opposed to using a stock +example script provided in TensorFlow): +- OS Platform and Distribution (e.g., +Linux Ubuntu 16.04): +- Mobile device (e.g. 
iPhone 8, Pixel 2, Samsung Galaxy) if +the issue happens on mobile device: +- TensorFlow installed from (source or +binary): - TensorFlow version (use command below): +- Python version: - Bazel +version (if compiling from source): +- GCC/Compiler version (if compiling from +source): +- CUDA/cuDNN version: - GPU model and memory: You can collect some of this information using our environment capture [script](https://github.com/tensorflow/tensorflow/tree/master/tools/tf_env_collect.sh) @@ -29,8 +36,9 @@ tensorflow as tf; print(tf.GIT_VERSION, tf.VERSION)"` 2. TF 2.0: `python -c **Describe the expected behavior** -**Code to reproduce the issue** Provide a reproducible test case that is the -bare minimum necessary to generate the problem. +**Standalone code to reproduce the issue** +Provide a reproducible test case that is the bare minimum necessary to generate +the problem. If possible, please share a link to Colab/Jupyter/any notebook. **Other info / logs** Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full From 867d3c97082cb2d26036d129ef7b51f3867a19d3 Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Wed, 19 Feb 2020 15:45:36 -0800 Subject: [PATCH 285/442] [SparseTensor] Optimize the `tf.sparse.to_dense()` implementation. This change includes several optimizations: 1. Introduce `SparseTensor::IndicesValidVectorFastPath()`, for validating the indices of a 1-D SparseTensor. The optimized code is similar to `IndicesValid32BitFastPath()`, which optimistically assumes that the tensor is valid and falls back to slower code in the failure case, except it does not have the 32-bit limitation. The compiler is able to vectorize the loop over the indices, for increased throughput. 2. Implement fast paths for 1-D and 2-D inputs in `SparseTensor::ToDense()`. The main win here comes from avoiding the data-dependent loop over dimensions when computing the index of the output value. We also avoid an unnecessary integer multiplication (by 1) in each case. 3. Minor optimizations to the 3+-D case in `SparseTensor::ToDense()`, avoiding unnecessary calls to `TensorShape::dim_size()` and using pointer arithmetic rather than Eigen logic to dereference index elements. 4. Minor optimizations to the `SparseTensor::Create()` method, which now assigns directly to the relevant fields of the result instead of invoking the `SparseTensor` constructor and the move assignment operator. In this case the existing move logic wasn't saving us much, because the `Tensor` and `gtl::InlinedVector` move constructors still have to copy quite a lot of data. 5. Minor optimizations to the `SparseToDense::Compute()` method. In particular, we avoid allocating a temporary tensor for the indices when the input is DT_INT64 (which is the common case, since all `tf.SparseTensor` objects have 64-bit indices). PiperOrigin-RevId: 296075159 Change-Id: I0b051621920aec9b2a8dc6c7ecbf55e5b2d59098 --- tensorflow/core/kernels/sparse_to_dense_op.cc | 44 +++++----- tensorflow/core/util/sparse/sparse_tensor.cc | 54 +++++++++++-- tensorflow/core/util/sparse/sparse_tensor.h | 81 +++++++++++++------ 3 files changed, 133 insertions(+), 46 deletions(-) diff --git a/tensorflow/core/kernels/sparse_to_dense_op.cc b/tensorflow/core/kernels/sparse_to_dense_op.cc index d9626052b0c..da4e7e070db 100644 --- a/tensorflow/core/kernels/sparse_to_dense_op.cc +++ b/tensorflow/core/kernels/sparse_to_dense_op.cc @@ -20,14 +20,13 @@ limitations under the License. 
#define EIGEN_USE_THREADS -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" - #include #include #include #include #include +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" @@ -35,6 +34,7 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/inlined_vector.h" #include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/util/ptr_util.h" #include "tensorflow/core/util/sparse/sparse_tensor.h" namespace tensorflow { @@ -93,36 +93,44 @@ class SparseToDense : public OpKernel { Tensor* output = nullptr; OP_REQUIRES_OK(c, c->allocate_output(0, output_tensor_shape, &output)); - TensorShape ix_shape({num_elems, num_dims}); - Tensor indices_shaped(DT_INT64, ix_shape); - if (indices.dtype() == DT_INT64) { - CHECK(indices_shaped.CopyFrom(indices, ix_shape)); + const Tensor* indices_shaped; + std::unique_ptr indices_shaped_holder; + if (indices.dtype() == DT_INT64 && indices.dims() == 2) { + indices_shaped = &indices; } else { - indices_shaped.matrix() = - indices.shaped(ix_shape.dim_sizes()).template cast(); + TensorShape ix_shape({num_elems, num_dims}); + indices_shaped_holder = MakeUnique(DT_INT64, ix_shape); + indices_shaped = indices_shaped_holder.get(); + if (indices.dtype() == DT_INT64) { + CHECK(indices_shaped_holder->CopyFrom(indices, ix_shape)); + } else { + indices_shaped_holder->matrix() = + indices.shaped(ix_shape.dim_sizes()) + .template cast(); + } } // If we received a scalar, we'll need to create a new // tensor with copies of the values as a vec. - // TODO(ebrevdo): find a way to avoid this temp allocation. - Tensor sparse_values_b; + const Tensor* sparse_values_b; + std::unique_ptr sparse_values_b_holder; if (TensorShapeUtils::IsScalar(sparse_values.shape())) { - OP_REQUIRES_OK( - c, c->allocate_temp(DataTypeToEnum::value, - TensorShape({num_elems}), &sparse_values_b)); - sparse_values_b.vec().setConstant(sparse_values.scalar()()); + sparse_values_b_holder = MakeUnique(DataTypeToEnum::value, + TensorShape({num_elems})); + sparse_values_b = sparse_values_b_holder.get(); + sparse_values_b_holder->vec().setConstant(sparse_values.scalar()()); } else { - sparse_values_b = sparse_values; + sparse_values_b = &sparse_values; } // Assume SparseTensor is lexicographically sorted. 
gtl::InlinedVector order(output->shape().dims()); std::iota(order.begin(), order.end(), 0); sparse::SparseTensor st; - OP_REQUIRES_OK(c, - sparse::SparseTensor::Create(indices_shaped, sparse_values_b, - output->shape(), order, &st)); + OP_REQUIRES_OK( + c, sparse::SparseTensor::Create(*indices_shaped, *sparse_values_b, + output->shape(), order, &st)); if (validate_indices_) { OP_REQUIRES_OK(c, st.IndicesValid()); diff --git a/tensorflow/core/util/sparse/sparse_tensor.cc b/tensorflow/core/util/sparse/sparse_tensor.cc index e58bd95f5a6..256ba57f1b6 100644 --- a/tensorflow/core/util/sparse/sparse_tensor.cc +++ b/tensorflow/core/util/sparse/sparse_tensor.cc @@ -65,7 +65,11 @@ Status GetDimsFromIx(const Tensor& ix, int* result) { return errors::InvalidArgument("Shape rank must be SparseTensor rank."); } - *result = SparseTensor(std::move(ix), std::move(vals), shape, order); + result->ix_ = std::move(ix); + result->vals_ = std::move(vals); + result->shape_.assign(shape.begin(), shape.end()); + result->order_.assign(order.begin(), order.end()); + result->dims_ = dims; return Status::OK(); } @@ -108,6 +112,37 @@ SparseTensor::SparseTensor(Tensor ix, Tensor vals, const VarDimArray shape, DCHECK_EQ(shape.size(), dims_) << "Shape rank must be SparseTensor rank."; } +// Optimized version of `IndicesValid()` with the following requirements: +// * The sparse tensor is one-dimensional. +// +// Returns true if the indices are valid, otherwise false. +// NOTE(mrry): If this method returns false, call IndicesValidHelper() +// to obtain a meaningful error message. +bool SparseTensor::IndicesValidVectorFastPath() const { + DCHECK_EQ(shape_.size(), 1); + DCHECK_EQ(order_[0], 0); + + const int64 max_index = shape_[0]; + + // We maintain separate bools for each validation predicate to enable + // vectorization across loop iterations. + bool index_in_range_valid = true; + bool order_valid = true; + + int64 prev_index = -1; + const auto ix_t = ix_.matrix(); + const int64* const index_base_ptr = ix_t.data(); + + for (std::size_t n = 0; n < ix_t.dimension(0); ++n) { + const int64 index = index_base_ptr[n]; + index_in_range_valid = index_in_range_valid & (index < max_index); + order_valid = order_valid & (index > prev_index); + prev_index = index; + } + + return index_in_range_valid & order_valid; +} + // Optimized version of `IndicesValid()` with the following requirements: // * The sparse tensor is two-dimensional. // * The tensor's indices are in the "standard" (lexicographic) order. @@ -116,7 +151,7 @@ SparseTensor::SparseTensor(Tensor ix, Tensor vals, const VarDimArray shape, // Returns true if the indices are valid, otherwise false. // NOTE(mrry): If this method returns false, call IndicesValidHelper() // to obtain a meaningful error message. 
-bool SparseTensor::IndicesValid32BitFastPath() const { +bool SparseTensor::IndicesValidMatrix32BitFastPath() const { const auto ix_t = ix_.matrix(); const int64* const shape_ptr = shape_.data(); @@ -241,6 +276,10 @@ Status SparseTensor::IndicesValidHelper() const { } Status SparseTensor::IndicesValid() const { + if (shape_.size() == 1 && IndicesValidVectorFastPath()) { + return Status::OK(); + } + bool standard_order = true; for (size_t i = 0; i < order_.size(); ++i) { if (order_[i] < 0) { @@ -252,9 +291,14 @@ Status SparseTensor::IndicesValid() const { } if (standard_order) { - if (shape_.size() == 2 && shape_[0] <= std::numeric_limits::max() && - shape_[1] <= std::numeric_limits::max()) { - if (IndicesValid32BitFastPath()) { + if (shape_.size() == 1) { + if (IndicesValidVectorFastPath()) { + return Status::OK(); + } + } else if (shape_.size() == 2 && + shape_[0] <= std::numeric_limits::max() && + shape_[1] <= std::numeric_limits::max()) { + if (IndicesValidMatrix32BitFastPath()) { return Status::OK(); } } diff --git a/tensorflow/core/util/sparse/sparse_tensor.h b/tensorflow/core/util/sparse/sparse_tensor.h index 03ae4fe3f68..2654d126e86 100644 --- a/tensorflow/core/util/sparse/sparse_tensor.h +++ b/tensorflow/core/util/sparse/sparse_tensor.h @@ -201,7 +201,14 @@ class SparseTensor { return vec; } - bool IndicesValid32BitFastPath() const; + // Optimized implementation of `IndicesValid` for 1-D sparse tensors. + // REQUIRES: `shape_.size() == 1`. + bool IndicesValidVectorFastPath() const; + + // Optimized implementation of `IndicesValid` for 2-D sparse tensors whose + // indices fit within the range of an `int32`. + // REQUIRES: `shape_.size() == 2`. + bool IndicesValidMatrix32BitFastPath() const; template Status IndicesValidHelper() const; @@ -354,32 +361,60 @@ inline bool SparseTensor::ToDense(Tensor* out, bool initialize) { if (!ValidateAndInitializeToDense(out, initialize)) return false; auto out_t = out->flat(); - auto ix_t = ix_.matrix(); auto vals_t = vals_.vec(); + auto ix_t = ix_.matrix(); + const int64* const ix_ptr = ix_t.data(); - std::vector strides(dims_); - const auto& out_shape = out->shape(); - if (dims_ > 0) { - strides[dims_ - 1] = 1; - } - for (int d = dims_ - 2; d >= 0; --d) { - strides[d] = strides[d + 1] * out_shape.dim_size(d + 1); - } - - for (int n = 0; n < vals_t.dimension(0); ++n) { - bool invalid_dims = false; - int64 ix = 0; - for (int d = 0; d < dims_; ++d) { - const int64 ix_n_d = internal::SubtleMustCopy(ix_t(n, d)); - if (!FastBoundsCheck(ix_n_d, out_shape.dim_size(d))) { - invalid_dims = true; - } - ix += strides[d] * ix_n_d; + if (dims_ == 1) { + // Fast path for sparse vectors. + const int64 out_length = out->shape().dim_size(0); + for (int n = 0; n < vals_t.dimension(0); ++n) { + const int64 index = internal::SubtleMustCopy(ix_ptr[n]); + if (!FastBoundsCheck(index, out_length)) return false; + out_t(index) = vals_t(n); } - if (invalid_dims) return false; - out_t(ix) = vals_t(n); + return true; + } else if (dims_ == 2) { + // Fast path for sparse matrices. 
+ const auto& out_shape = out->shape(); + const int64 out_rows = out_shape.dim_size(0); + const int64 out_cols = out_shape.dim_size(1); + for (int n = 0; n < vals_t.dimension(0); ++n) { + const int64 row_index = internal::SubtleMustCopy(ix_ptr[n * 2]); + const int64 col_index = internal::SubtleMustCopy(ix_ptr[n * 2 + 1]); + if (!(FastBoundsCheck(row_index, out_rows) && + FastBoundsCheck(col_index, out_cols))) { + return false; + } + out_t(row_index * out_cols + col_index) = vals_t(n); + } + return true; + } else { + // General path for N-dimensional sparse tensors. + gtl::InlinedVector strides(dims_); + const auto& out_shape = out->shape().dim_sizes(); + if (dims_ > 0) { + strides[dims_ - 1] = 1; + } + for (int d = dims_ - 2; d >= 0; --d) { + strides[d] = strides[d + 1] * out_shape[d + 1]; + } + + for (int n = 0; n < vals_t.dimension(0); ++n) { + bool invalid_dims = false; + int64 ix = 0; + for (int d = 0; d < dims_; ++d) { + const int64 ix_n_d = internal::SubtleMustCopy(ix_ptr[n * dims_ + d]); + if (!FastBoundsCheck(ix_n_d, out_shape[d])) { + invalid_dims = true; + } + ix += strides[d] * ix_n_d; + } + if (invalid_dims) return false; + out_t(ix) = vals_t(n); + } + return true; } - return true; } template From 80e01e89051f34822353a514a8afe388164b93b0 Mon Sep 17 00:00:00 2001 From: Yujing Zhang Date: Wed, 19 Feb 2020 15:48:23 -0800 Subject: [PATCH 286/442] Strip default attributes before sending a remote RegisterFunction request, in order to support forward compatibility across RPCs. PiperOrigin-RevId: 296075787 Change-Id: If536f03ab7d37fdba5d3431995d4b28d561ec78c --- .../core/common_runtime/eager/context.cc | 3 +++ .../eager/cluster_function_library_runtime.cc | 14 +++++++++++ .../eager/eager_service_impl_test.cc | 25 +++++++++++++++++++ 3 files changed, 42 insertions(+) diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc index f4e998a1c1e..7c7f1b3f498 100644 --- a/tensorflow/core/common_runtime/eager/context.cc +++ b/tensorflow/core/common_runtime/eager/context.cc @@ -570,6 +570,9 @@ Status EagerContext::RegisterExistingFunctionsOnRemoteWorkers( eager::RegisterFunctionOp* register_function = request->add_queue()->mutable_register_function(); *register_function->mutable_function_def() = *function_defs[j]; + StripDefaultAttributes( + *OpRegistry::Global(), + register_function->mutable_function_def()->mutable_node_def()); auto* response = new eager::EnqueueResponse; eager_client->StreamingEnqueueAsync( request, response, [request, response](const Status& s) { diff --git a/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc b/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc index 06e74bfdad6..b9b4183ced4 100644 --- a/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc +++ b/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc @@ -24,11 +24,24 @@ limitations under the License. 
#include "tensorflow/core/distributed_runtime/eager/remote_execute_node.h" #include "tensorflow/core/distributed_runtime/eager/remote_mgr.h" #include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph_def_util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/cleanup.h" namespace tensorflow { namespace eager { +namespace { +void StripDefaultAttributesInRegisterFunctionOp( + RegisterFunctionOp* register_function) { + StripDefaultAttributes( + *OpRegistry::Global(), + register_function->mutable_function_def()->mutable_node_def()); + for (auto& function : + *register_function->mutable_library()->mutable_function()) { + StripDefaultAttributes(*OpRegistry::Global(), function.mutable_node_def()); + } +} +} // namespace void EagerClusterFunctionLibraryRuntime::Instantiate( const string& function_name, const FunctionLibraryDefinition& lib_def, @@ -85,6 +98,7 @@ void EagerClusterFunctionLibraryRuntime::Instantiate( *register_function->mutable_library() = func_lib_def.ReachableDefinitions(register_function->function_def()) .ToProto(); + StripDefaultAttributesInRegisterFunctionOp(register_function); eager_client->EnqueueAsync( request, response, diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc index 87459f4bb39..686f471ca5e 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc @@ -234,6 +234,12 @@ tensorflow::FunctionDef MatMulFunction() { " type: DT_FLOAT" " }" " }" + " attr {" + " key: 'transpose_a'" + " value {" + " b: false" + " }" + " }" " }" " ret {" " key: 'm'" @@ -470,6 +476,15 @@ class FunctionWithRemoteInputsTest : public EagerServiceImplTest { serialize_remote_handle_; }; + bool MatMulHasAttrWithDefaultValue(const tensorflow::FunctionDef& fdef) { + for (const auto& node : fdef.node_def()) { + if (node.op() == "MatMul") { + return node.attr().find("transpose_a") != node.attr().end(); + } + } + return false; + } + void Init() { CreateContextRequest request; request.mutable_server_def()->set_job_name("localhost"); @@ -559,8 +574,18 @@ TEST_F(FunctionWithRemoteInputsTest, EagerPFLRTest) { options.is_multi_device_function = true; options.input_devices.push_back(local_device_); FunctionLibraryRuntime::Handle handle; + EXPECT_TRUE(MatMulHasAttrWithDefaultValue(fdef_)); TF_ASSERT_OK(eager_pflr_->Instantiate( fdef_.signature().name(), AttrSlice(&fdef_.attr()), options, &handle)); + EagerContext* ctx = nullptr; + TF_ASSERT_OK(eager_service_impl_.GetEagerContext(context_id_, &ctx)); + for (const string& func_name : ctx->FuncLibDef()->ListFunctionNames()) { + const FunctionDef* fdef = ctx->FuncLibDef()->Find(func_name); + EXPECT_TRUE(fdef != nullptr); + if (absl::StartsWith(func_name, "MatMulFunction")) { + EXPECT_FALSE(MatMulHasAttrWithDefaultValue(*fdef)); + } + } bool is_cross_process = false; TF_CHECK_OK(eager_pflr_->IsCrossProcess(handle, &is_cross_process)); EXPECT_TRUE(is_cross_process); From 6d5c688b162f0489822cba41ac573bde87f5b639 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Wed, 19 Feb 2020 15:49:31 -0800 Subject: [PATCH 287/442] Make use of GetDataDependencyFilepath and JoinPath to build paths which will work across operating systems. The previous implementation doesn't work correctly on Windows. 
PiperOrigin-RevId: 296076015 Change-Id: I8c0876ab01bd1802657e51d9fc4b06271a2fea5c --- tensorflow/compiler/xla/service/gpu/BUILD | 1 + .../xla/service/gpu/hlo_algorithm_blacklist_test.cc | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 1f1efbd8545..c812272829a 100755 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -1658,6 +1658,7 @@ tf_cc_test( "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", + "//tensorflow/core/platform:resource_loader", "//tensorflow/stream_executor:dnn", ], ) diff --git a/tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist_test.cc b/tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist_test.cc index bf9ac31559a..bc24f486668 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist_test.cc +++ b/tensorflow/compiler/xla/service/gpu/hlo_algorithm_blacklist_test.cc @@ -17,6 +17,8 @@ limitations under the License. #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/resource_loader.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/stream_executor/dnn.h" @@ -31,9 +33,9 @@ class BlacklistTest : public testing::Test { "XLA_FLAGS", absl::StrCat( "--xla_gpu_algorithm_blacklist_path=", - tensorflow::io::JoinPath(tensorflow::testing::TensorFlowSrcRoot(), - "compiler", "xla", "service", "gpu", - "data", "hlo_algorithm_blacklist.pbtxt")) + tensorflow::GetDataDependencyFilepath(tensorflow::io::JoinPath( + "tensorflow", "compiler", "xla", "service", "gpu", "data", + "hlo_algorithm_blacklist.pbtxt"))) .data(), 0); } From fe0374153ef3e5f0f4104666ca83200dcfdbae0a Mon Sep 17 00:00:00 2001 From: Nat Jeffries Date: Wed, 19 Feb 2020 15:53:39 -0800 Subject: [PATCH 288/442] Allow operator invoke calls to return values other than kTfLiteOk and kTfLiteError. Abort the invoke loop if a non-kTfLiteOk return value is found. 
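A hedged sketch (not part of this patch) of how a caller might consume the widened set of return values from `MicroInterpreter::Invoke()`; the `RunOnce` helper is hypothetical, and no specific non-Ok, non-Error status value is assumed beyond whatever an operator chooses to return:

```cpp
// Sketch only: distinguish a hard kernel failure from an operator-defined
// early-exit status after this change.
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/micro/micro_interpreter.h"

TfLiteStatus RunOnce(tflite::MicroInterpreter* interpreter) {
  const TfLiteStatus status = interpreter->Invoke();
  if (status == kTfLiteOk) {
    return kTfLiteOk;  // Full graph ran to completion.
  }
  if (status == kTfLiteError) {
    return kTfLiteError;  // A kernel failed; the interpreter already logged it.
  }
  // Any other value was propagated unchanged from an operator, e.g. to signal
  // a partial run of a strided model; the caller decides how to proceed.
  return status;
}
```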
PiperOrigin-RevId: 296076951 Change-Id: Ibadc55d18e61231630cf82fdb6e5d283ad3a489d --- tensorflow/lite/micro/micro_interpreter.cc | 32 ++++++++++++---------- tensorflow/lite/micro/micro_interpreter.h | 3 ++ 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/tensorflow/lite/micro/micro_interpreter.cc b/tensorflow/lite/micro/micro_interpreter.cc index 45254e04d7e..76d9a7aea23 100644 --- a/tensorflow/lite/micro/micro_interpreter.cc +++ b/tensorflow/lite/micro/micro_interpreter.cc @@ -161,23 +161,23 @@ TfLiteStatus MicroInterpreter::AllocateTensors() { } } - for (size_t i = 0; i < operators_->size(); ++i) { - auto* node = &(node_and_registrations_[i].node); - auto* registration = node_and_registrations_[i].registration; - if (registration->prepare) { - TfLiteStatus prepare_status = registration->prepare(&context_, node); - if (prepare_status != kTfLiteOk) { - TF_LITE_REPORT_ERROR( - error_reporter_, - "Node %s (number %d) failed to prepare with status %d", - OpNameFromRegistration(registration), i, prepare_status); - return kTfLiteError; - } + for (size_t i = 0; i < operators_->size(); ++i) { + auto* node = &(node_and_registrations_[i].node); + auto* registration = node_and_registrations_[i].registration; + if (registration->prepare) { + TfLiteStatus prepare_status = registration->prepare(&context_, node); + if (prepare_status != kTfLiteOk) { + TF_LITE_REPORT_ERROR( + error_reporter_, + "Node %s (number %d) failed to prepare with status %d", + OpNameFromRegistration(registration), i, prepare_status); + return kTfLiteError; } } + } - tensors_allocated_ = true; - return kTfLiteOk; + tensors_allocated_ = true; + return kTfLiteOk; } TfLiteStatus MicroInterpreter::Invoke() { @@ -199,12 +199,14 @@ TfLiteStatus MicroInterpreter::Invoke() { if (registration->invoke) { TfLiteStatus invoke_status = registration->invoke(&context_, node); - if (invoke_status != kTfLiteOk) { + if (invoke_status == kTfLiteError) { TF_LITE_REPORT_ERROR( error_reporter_, "Node %s (number %d) failed to invoke with status %d", OpNameFromRegistration(registration), i, invoke_status); return kTfLiteError; + } else if (invoke_status != kTfLiteOk) { + return invoke_status; } } } diff --git a/tensorflow/lite/micro/micro_interpreter.h b/tensorflow/lite/micro/micro_interpreter.h index 4d02769cc3b..ad3a4fe3253 100644 --- a/tensorflow/lite/micro/micro_interpreter.h +++ b/tensorflow/lite/micro/micro_interpreter.h @@ -45,6 +45,9 @@ class MicroInterpreter { // intermediate tensors. TfLiteStatus AllocateTensors(); + // In order to support partial graph runs for strided models, this can return + // values other than kTfLiteOk and kTfLiteError. + // TODO(b/149795762): Add this to the TfLiteStatus enum. TfLiteStatus Invoke(); size_t tensors_size() const { return context_.tensors_size; } From 74c9e141067c804bb9a5f94df9342d270cc01f75 Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Wed, 19 Feb 2020 16:05:05 -0800 Subject: [PATCH 289/442] Fix ConvLSTM2D layer with initial states. Removing most of the duplicated logic and allow RNN layer to handle it. 
Fix #35306 PiperOrigin-RevId: 296079716 Change-Id: I2a5506f7fad34405b0b2ff0dae13c14682e9a349 --- .../keras/layers/convolutional_recurrent.py | 73 ++----------------- .../layers/convolutional_recurrent_test.py | 30 ++++++++ 2 files changed, 35 insertions(+), 68 deletions(-) diff --git a/tensorflow/python/keras/layers/convolutional_recurrent.py b/tensorflow/python/keras/layers/convolutional_recurrent.py index e5fb30083a4..7b8b51c5276 100644 --- a/tensorflow/python/keras/layers/convolutional_recurrent.py +++ b/tensorflow/python/keras/layers/convolutional_recurrent.py @@ -28,7 +28,6 @@ from tensorflow.python.keras import initializers from tensorflow.python.keras import regularizers from tensorflow.python.keras.engine.base_layer import Layer from tensorflow.python.keras.engine.input_spec import InputSpec -from tensorflow.python.keras.layers.recurrent import _standardize_args from tensorflow.python.keras.layers.recurrent import DropoutRNNCellMixin from tensorflow.python.keras.layers.recurrent import RNN from tensorflow.python.keras.utils import conv_utils @@ -292,55 +291,6 @@ class ConvRNN2D(RNN): else: return [initial_state] - def __call__(self, inputs, initial_state=None, constants=None, **kwargs): - inputs, initial_state, constants = _standardize_args( - inputs, initial_state, constants, self._num_constants) - - if initial_state is None and constants is None: - return super(ConvRNN2D, self).__call__(inputs, **kwargs) - - # If any of `initial_state` or `constants` are specified and are Keras - # tensors, then add them to the inputs and temporarily modify the - # input_spec to include them. - - additional_inputs = [] - additional_specs = [] - if initial_state is not None: - kwargs['initial_state'] = initial_state - additional_inputs += initial_state - self.state_spec = [] - for state in initial_state: - shape = K.int_shape(state) - self.state_spec.append(InputSpec(shape=shape)) - - additional_specs += self.state_spec - if constants is not None: - kwargs['constants'] = constants - additional_inputs += constants - self.constants_spec = [InputSpec(shape=K.int_shape(constant)) - for constant in constants] - self._num_constants = len(constants) - additional_specs += self.constants_spec - # at this point additional_inputs cannot be empty - for tensor in additional_inputs: - if K.is_keras_tensor(tensor) != K.is_keras_tensor(additional_inputs[0]): - raise ValueError('The initial state or constants of an RNN' - ' layer cannot be specified with a mix of' - ' Keras tensors and non-Keras tensors') - - if K.is_keras_tensor(additional_inputs[0]): - # Compute the full input spec, including state and constants - full_input = [inputs] + additional_inputs - full_input_spec = self.input_spec + additional_specs - # Perform the call with temporarily replaced input_spec - original_input_spec = self.input_spec - self.input_spec = full_input_spec - output = super(ConvRNN2D, self).__call__(full_input, **kwargs) - self.input_spec = original_input_spec - return output - else: - return super(ConvRNN2D, self).__call__(inputs, **kwargs) - def call(self, inputs, mask=None, @@ -349,23 +299,11 @@ class ConvRNN2D(RNN): constants=None): # note that the .build() method of subclasses MUST define # self.input_spec and self.state_spec with complete input shapes. 
- if isinstance(inputs, list): - inputs = inputs[0] - if initial_state is not None: - pass - elif self.stateful: - initial_state = self.states - else: - initial_state = self.get_initial_state(inputs) + inputs, initial_state, constants = self._process_inputs( + inputs, initial_state, constants) if isinstance(mask, list): mask = mask[0] - - if len(initial_state) != len(self.states): - raise ValueError('Layer has ' + str(len(self.states)) + - ' states but was passed ' + - str(len(initial_state)) + - ' initial states.') timesteps = K.int_shape(inputs)[1] kwargs = {} @@ -377,10 +315,9 @@ class ConvRNN2D(RNN): raise ValueError('RNN cell does not support constants') def step(inputs, states): - constants = states[-self._num_constants:] - states = states[:-self._num_constants] - return self.cell.call(inputs, states, constants=constants, - **kwargs) + constants = states[-self._num_constants:] # pylint: disable=invalid-unary-operand-type + states = states[:-self._num_constants] # pylint: disable=invalid-unary-operand-type + return self.cell.call(inputs, states, constants=constants, **kwargs) else: def step(inputs, states): return self.cell.call(inputs, states, **kwargs) diff --git a/tensorflow/python/keras/layers/convolutional_recurrent_test.py b/tensorflow/python/keras/layers/convolutional_recurrent_test.py index d0da360ef5f..05d19e9ae16 100644 --- a/tensorflow/python/keras/layers/convolutional_recurrent_test.py +++ b/tensorflow/python/keras/layers/convolutional_recurrent_test.py @@ -202,6 +202,36 @@ class ConvLSTMTest(keras_parameterized.TestCase): outputs = clone.predict(test_inputs) self.assertAllClose(reference_outputs, outputs, atol=1e-5) + def test_conv_lstm_with_initial_state(self): + num_samples = 128 + sequence_len = 10 + encoder_inputs = keras.layers.Input((None, 32, 32, 3)) + encoder = keras.layers.ConvLSTM2D( + filters=32, kernel_size=(3, 3), padding='same', + return_sequences=False, return_state=True) + _, state_h, state_c = encoder(encoder_inputs) + encoder_states = [state_h, state_c] + + decoder_inputs = keras.layers.Input((None, 32, 32, 4)) + decoder_lstm = keras.layers.ConvLSTM2D( + filters=32, kernel_size=(3, 3), padding='same', + return_sequences=False, return_state=False) + decoder_outputs = decoder_lstm(decoder_inputs, initial_state=encoder_states) + output = keras.layers.Conv2D( + 1, (3, 3), padding='same', activation='relu')(decoder_outputs) + model = keras.Model([encoder_inputs, decoder_inputs], output) + + model.compile( + optimizer='sgd', loss='mse', + run_eagerly=testing_utils.should_run_eagerly(), + experimental_run_tf_function=testing_utils.should_run_tf_function()) + x_1 = np.random.rand(num_samples, sequence_len, 32, 32, 3) + x_2 = np.random.rand(num_samples, sequence_len, 32, 32, 4) + y = np.random.rand(num_samples, 32, 32, 1) + model.fit([x_1, x_2], y) + + model.predict([x_1, x_2]) + if __name__ == '__main__': test.main() From 2b95bfb6d812d40c3ef9001c61068571b7c059c2 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 19 Feb 2020 16:06:44 -0800 Subject: [PATCH 290/442] Add MakeUnaryHlo() and MakeReverseHlo() to hlo_creation_utils.h/.cc PiperOrigin-RevId: 296080049 Change-Id: I81d020a76da6820086a1a50379c77efc6c43918c --- .../compiler/xla/service/hlo_creation_utils.cc | 18 ++++++++++++++++++ .../compiler/xla/service/hlo_creation_utils.h | 10 ++++++++++ 2 files changed, 28 insertions(+) diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.cc b/tensorflow/compiler/xla/service/hlo_creation_utils.cc index 846b9cfbeb5..dd174772c62 100644 --- a/tensorflow/compiler/xla/service/hlo_creation_utils.cc +++ b/tensorflow/compiler/xla/service/hlo_creation_utils.cc @@ -33,6 +33,15 @@ limitations under the License. namespace xla { using absl::StrCat; +StatusOr MakeUnaryHlo(HloOpcode opcode, + HloInstruction* operand) { + HloComputation* computation = operand->parent(); + TF_ASSIGN_OR_RETURN(Shape unary_op_shape, + ShapeInference::InferUnaryOpShape(opcode, operand)); + return computation->AddInstruction( + HloInstruction::CreateUnary(unary_op_shape, opcode, operand)); +} + StatusOr MakeBinaryHlo(HloOpcode opcode, HloInstruction* lhs, HloInstruction* rhs) { HloComputation* computation = lhs->parent(); @@ -344,6 +353,15 @@ StatusOr MakeReduceHlo(HloInstruction* operand, scalar_shape, operand, init_value, all_dims, reduce_computation)); } +StatusOr MakeReverseHlo(HloInstruction* operand, + absl::Span dimensions) { + HloComputation* computation = operand->parent(); + TF_ASSIGN_OR_RETURN(Shape reverse_shape, ShapeInference::InferReverseShape( + operand->shape(), dimensions)); + return computation->AddInstruction( + HloInstruction::CreateReverse(reverse_shape, operand, dimensions)); +} + StatusOr MakeSelectHlo(HloInstruction* pred, HloInstruction* on_true, HloInstruction* on_false, diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.h b/tensorflow/compiler/xla/service/hlo_creation_utils.h index 754f7e2be33..3f2e3aa25a1 100644 --- a/tensorflow/compiler/xla/service/hlo_creation_utils.h +++ b/tensorflow/compiler/xla/service/hlo_creation_utils.h @@ -27,6 +27,11 @@ namespace xla { // ergonomic. We don't have a complete set of helpers yet -- I expect we'll // expand this interface as needed on an ad-hoc basis. +// Creates a unary HLO instruction and adds it to the computation containing +// `operand`. +StatusOr MakeUnaryHlo(HloOpcode opcode, + HloInstruction* operand); + // Creates a binary HLO instruction and adds it to the computation containing // `lhs` and `rhs` (`lhs` and `rhs` must be in the same computation). StatusOr MakeBinaryHlo(HloOpcode opcode, HloInstruction* lhs, @@ -145,6 +150,11 @@ StatusOr MakeReduceHlo(HloInstruction* operand, HloOpcode binary_opcode, HloModule* module); +// Creates a Reverse HLO instruction and adds it to the computation containing +// `operand`. +StatusOr MakeReverseHlo(HloInstruction* operand, + absl::Span dimensions); + // Creates a Select HLO instruction and adds it to the computation containing // the predicate. The on_true and on_false instructions must also be contained // in the same computation. 
If on_true and on_false are tuples, create a tuple From edaaeaddbdf996a089b3041c0d8fe4677e37c9e0 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 19 Feb 2020 16:12:20 -0800 Subject: [PATCH 291/442] [TF:MLIR] Add canonicalization pattern to TransposeOp and compose a layout optimizer pipeline PiperOrigin-RevId: 296081205 Change-Id: Ica9b311ba83e2e75b726eacbdc393c03692dacb8 --- .../compiler/mlir/tensorflow/ir/tf_ops.cc | 63 ++++++++++++++++--- .../mlir/tensorflow/tests/canonicalize.mlir | 22 +++++++ .../tensorflow/tests/layout_optimization.mlir | 24 +++++++ .../transforms/layout_optimization.cc | 47 ++++++++++++++ 4 files changed, 146 insertions(+), 10 deletions(-) create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/layout_optimization.mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index 0d70d8793ee..c97f2ed5420 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -151,6 +151,26 @@ static bool AreCastCompatible(Type a, Type b) { b_kind == TensorFlowTypes::VARIANT; } +static bool AreCancellablePermutations(DenseIntElementsAttr perm0, + DenseIntElementsAttr perm1) { + if (perm0.getNumElements() == 0 || perm1.getNumElements() == 0) return false; + if (perm0.getNumElements() != perm1.getNumElements()) return false; + + SmallVector perm0_values; + for (auto value : perm0.getIntValues()) + perm0_values.push_back(value.getSExtValue()); + + SmallVector perm1_values; + for (auto value : perm1.getIntValues()) + perm1_values.push_back(value.getSExtValue()); + + for (int i = 0; i < perm0_values.size(); ++i) { + if (perm0_values[perm1_values[i]] != i) return false; + } + + return true; +} + static bool IsUnknownDimOrRank(int64_t dim_or_rank) { return dim_or_rank == -1; } @@ -2723,23 +2743,46 @@ void TransposeOp::build(Builder *builder, OperationState &result, Value x, perm); } -OpFoldResult TransposeOp::fold(ArrayRef operands) { - auto const_perm = dyn_cast_or_null(perm().getDefiningOp()); +namespace { - if (!const_perm) { - return {}; - } +OpFoldResult FoldIdentityTranspose(TransposeOp op) { + auto const_perm = dyn_cast_or_null(op.perm().getDefiningOp()); + if (!const_perm) return {}; auto const_value = const_perm.value(); - const auto &elements = const_value.getValues(); + for (auto it : llvm::enumerate(elements)) { - if (it.index() != it.value()) { - return {}; - } + if (it.index() != it.value()) return {}; } - return x(); + return op.x(); +} + +OpFoldResult FoldCancellableTranspose(TransposeOp op) { + // Operand is a TransposeOp. + auto transpose = dyn_cast_or_null(op.x().getDefiningOp()); + if (!transpose) return {}; + + // Permutations defined by constant operations. 
+ auto perm0 = dyn_cast_or_null(op.perm().getDefiningOp()); + auto perm1 = dyn_cast_or_null(transpose.perm().getDefiningOp()); + if (!perm0 || !perm1) return {}; + + // With permutation indices that cancel each other + auto perm0_value = perm0.value().cast(); + auto perm1_value = perm1.value().cast(); + if (!AreCancellablePermutations(perm0_value, perm1_value)) return {}; + + return transpose.x(); +} + +} // namespace + +OpFoldResult TransposeOp::fold(ArrayRef operands) { + if (auto folded = FoldIdentityTranspose(*this)) return folded; + if (auto folded = FoldCancellableTranspose(*this)) return folded; + return {}; } //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir index c91c1e2f7b5..5bf5b0610ae 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir @@ -383,6 +383,28 @@ func @nonIdentityTranspose(%arg0: tensor<2x3x4x5x6xf32>) -> tensor<2x3x4x6x5xf32 // CHECK: return %1 } +// CHECK-LABEL: @cancellableTranspose +func @cancellableTranspose(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> { + %0 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> + %1 = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> + %2 = "tf.Transpose"(%arg0, %0) : (tensor<1x4x4x8xf32>, tensor<4xi32>) -> tensor<1x8x4x4xf32> + %3 = "tf.Transpose"(%2, %1) : (tensor<1x8x4x4xf32>, tensor<4xi32>) -> tensor<1x4x4x8xf32> + + return %3 : tensor<1x4x4x8xf32> + // CHECK: return %arg0 +} + +// CHECK-LABEL: @nonCancellableTranspose +func @nonCancellableTranspose(%arg0: tensor<1x4x4x8xf32>) -> tensor<4x1x4x8xf32> { + %0 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> + %1 = "tf.Const"() {value = dense<[2, 0, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> + %2 = "tf.Transpose"(%arg0, %0) : (tensor<1x4x4x8xf32>, tensor<4xi32>) -> tensor<1x8x4x4xf32> + %3 = "tf.Transpose"(%2, %1) : (tensor<1x8x4x4xf32>, tensor<4xi32>) -> tensor<4x1x4x8xf32> + + return %3 : tensor<4x1x4x8xf32> + // CHECK: return %3 +} + // CHECK-LABEL: func @addN func @addN(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK: return %arg0 diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization.mlir new file mode 100644 index 00000000000..44330d675e2 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization.mlir @@ -0,0 +1,24 @@ +// RUN: tf-opt %s -tf-layout-optimization=force-data-format=NCHW -verify-diagnostics | FileCheck %s --dump-input=always + +// CHECK-LABEL: func @transposeBiasAdd +func @transposeBiasAdd(%arg0: tensor<1x8x4x4xf32>, %arg1: tensor<8xf32>) -> tensor<1x8x4x4xf32> { + + // Convert input: NCHW -> NHWC + %0 = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} : () -> tensor<4xi64> + %1 = "tf.Transpose"(%arg0, %0) : (tensor<1x8x4x4xf32>, tensor<4xi64>) -> tensor<1x4x4x8xf32> + + // Compute in NHWC + %2 = "tf.BiasAdd"(%1, %arg1) {data_format = "NHWC"} : (tensor<1x4x4x8xf32>, tensor<8xf32>) -> tensor<1x4x4x8xf32> + + // Convert result back: NHWC -> NCHW + %3 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> + %4 = "tf.Transpose"(%2, %3) : (tensor<1x4x4x8xf32>, tensor<4xi64>) -> tensor<1x8x4x4xf32> + + // Check that BiasAdd computed in NCHW format, and all redundant 
transpose + // operations removed from the function. + + // CHECK: %[[BIAS_ADD:[0-9]*]] = "tf.BiasAdd"(%arg0, %arg1) {data_format = "NCHW"} {{.*}} tensor<1x8x4x4xf32> + // CHECK: return %[[BIAS_ADD]] + + return %4 : tensor<1x8x4x4xf32> +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc b/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc index ba46059e5b6..feef3516ade 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc @@ -18,7 +18,9 @@ limitations under the License. #include "mlir/IR/Builders.h" // TF:llvm-project #include "mlir/IR/Function.h" // TF:llvm-project #include "mlir/Pass/Pass.h" // TF:llvm-project +#include "mlir/Pass/PassManager.h" // TF:llvm-project #include "mlir/Pass/PassRegistry.h" // TF:llvm-project +#include "mlir/Transforms/Passes.h" // TF:llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #define DEBUG_TYPE "tf-layout-optimization" @@ -28,11 +30,25 @@ namespace TF { namespace { +// Layout optimization pipeline composes layout assignment and move transposes +// passes to pick the optimal layout for all layout sensitive operations, and +// cancel all redundant transposes. +struct LayoutOptimizationPipelineOptions + : public PassPipelineOptions { + Option force_data_format{ + *this, "force-data-format", + llvm::cl::desc("Force data format for all layout sensitive ops")}; +}; + // LayoutAssignmentPass assigns optimal data layout (data format) for all // layout sensitive operations. class LayoutAssignmentPass : public FunctionPass { public: LayoutAssignmentPass() = default; + explicit LayoutAssignmentPass(const std::string& force_data_format) { + force_data_format_ = force_data_format; + } + LayoutAssignmentPass(const LayoutAssignmentPass& pass) {} void runOnFunction() final; @@ -52,6 +68,7 @@ class MoveTransposesPass : public FunctionPass { enum class Direction { kBegin, kEnd }; MoveTransposesPass() = default; + explicit MoveTransposesPass(Direction direction) { direction_ = direction; } MoveTransposesPass(const MoveTransposesPass& pass) {} void runOnFunction() final; @@ -356,6 +373,30 @@ void MoveTransposesPass::runOnFunction() { MoveTransposeAfter(op, &work_list); } } + + func.walk([&](TransposeOp transpose) { + OpBuilder builder(transpose); + SmallVector fold_result; + if (succeeded(builder.tryFold(transpose.getOperation(), fold_result))) { + assert(fold_result.size() == 1); + transpose.replaceAllUsesWith(fold_result[0]); + } + }); +} + +void CreateLayoutOptimizationPipeline( + OpPassManager& pm, // NOLINT - MLIR contract is pass by mutable reference. + const LayoutOptimizationPipelineOptions& options) { + using Direction = MoveTransposesPass::Direction; + + // Assign optimal layout for layout sensitive ops. + pm.addPass(std::make_unique(options.force_data_format)); + + // Move transposes to the beginning of the block and try to fold them. + pm.addPass(std::make_unique(Direction::kBegin)); + + // Move transposes to the end of the block and try to fold them. 
+ pm.addPass(std::make_unique(Direction::kEnd)); } } // namespace @@ -365,5 +406,11 @@ static PassRegistration layout_assignment( static PassRegistration move_transposes( "tf-move-transposes", "Move transposes pass"); +static mlir::PassPipelineRegistration + pipeline("tf-layout-optimization", + "Assigns optimal data layout to all layout sensitive operations " + "and cancel redundant transpose operations.", + CreateLayoutOptimizationPipeline); + } // namespace TF } // namespace mlir From ba2cbe1e5570a9b10f33cd6a0e57c0759c9d00d7 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Wed, 19 Feb 2020 16:25:30 -0800 Subject: [PATCH 292/442] Avoid direct access to the env var TEST_UNDECLARED_OUTPUTS_DIR. On Windows, Bazel populates this path with `/`s only making proper path management impossible without sanitizing the path up front. This changes to accessing the env var through an indirection layer which will fix path problems on Windows when the codebase is ready to switch over. PiperOrigin-RevId: 296083765 Change-Id: I26bbaf83ba5e3fafd3ab0a0de08f6cb597b94477 --- .../mlir/tensorflow/utils/dump_mlir_util.cc | 30 +++++++++---------- .../mlir/tensorflow/utils/dump_mlir_util.h | 2 +- .../compiler/tf2xla/mlir_bridge_pass.cc | 5 ++-- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc index ead26c8f17d..f06734a26bd 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc @@ -27,6 +27,7 @@ limitations under the License. #include "mlir/IR/Operation.h" // TF:llvm-project #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/path.h" namespace tensorflow { @@ -97,18 +98,18 @@ struct WritableFileRawStream : public llvm::raw_ostream { Status CreateFileForDumping(llvm::StringRef name, std::unique_ptr* os, std::string* filepath, llvm::StringRef dirname) { - const char* dir = nullptr; + std::string dir; if (!dirname.empty()) - dir = dirname.data(); + dir = std::string(dirname); else dir = GetDumpDirFromEnvVar(); - if (!dir) { + if (dir.empty()) { return Status(error::Code::INVALID_ARGUMENT, "(TF_DUMP_GRAPH_PREFIX not specified)"); } - if (std::strncmp(dir, "-", 2) == 0) { + if (dir == "-") { *os = std::make_unique(); *filepath = "LOG(INFO)"; return Status(); @@ -151,25 +152,24 @@ std::string DumpMlirOpToFile(llvm::StringRef name, mlir::Operation* op, return filepath; } -const char* GetDumpDirFromEnvVar() { +std::string GetDumpDirFromEnvVar() { const char* prefix_env = getenv("TF_DUMP_GRAPH_PREFIX"); if (!prefix_env) { LOG(WARNING) << "Failed to dump MLIR module because dump location is not " << " specified through TF_DUMP_GRAPH_PREFIX environment variable."; - return nullptr; + return ""; } - if (absl::EqualsIgnoreCase(prefix_env, "sponge")) { - const char* tmp_dir = getenv("TEST_UNDECLARED_OUTPUTS_DIR"); - if (!tmp_dir) { - LOG(WARNING) << "TF_DUMP_GRAPH_PREFIX=sponge but " - "TEST_UNDECLARED_OUTPUT_DIRS is not set"; - return nullptr; - } - return tmp_dir; + std::string result = prefix_env; + + if (absl::EqualsIgnoreCase(result, "sponge") && + !io::GetTestUndeclaredOutputsDir(&result)) { + LOG(WARNING) << "TF_DUMP_GRAPH_PREFIX=sponge but " + "TEST_UNDECLARED_OUTPUT_DIRS is not set"; + return ""; } - return prefix_env; + return result; } } // namespace tensorflow diff --git 
a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h index 7c25a809089..14c0d1f0b6e 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h @@ -54,7 +54,7 @@ std::string DumpMlirOpToFile(llvm::StringRef name, mlir::Operation* op, // Default is reading from TF_DUMP_GRAPH_PREFIX, and if the string is 'sponge' // read from TEST_UNDECLARED_OUTPUTS_DIR. Returns nullptr if the directory // cannot be determined and generates a warning message. -const char* GetDumpDirFromEnvVar(); +std::string GetDumpDirFromEnvVar(); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc index a0ffd1908c5..7ac4cb8fb06 100644 --- a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc +++ b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc @@ -35,11 +35,10 @@ namespace tensorflow { // This require the TF_DUMP_GRAPH_PREFIX to be set to a path that exist (or can // be created). static void DumpModule(mlir::ModuleOp module, llvm::StringRef file_prefix) { - const char* prefix_env = GetDumpDirFromEnvVar(); - if (!prefix_env) { + std::string prefix = GetDumpDirFromEnvVar(); + if (prefix.empty()) { return; } - std::string prefix = prefix_env; auto* env = tensorflow::Env::Default(); auto status = env->RecursivelyCreateDir(prefix); From 3aecbb9fb163d72618524c98b5633ca521514387 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Wed, 19 Feb 2020 16:30:48 -0800 Subject: [PATCH 293/442] Supported F32_F16 precision in Winograd transformations. PiperOrigin-RevId: 296084853 Change-Id: If7f1715d84eae34159cf403d1ad208f9d1aa7305 --- .../lite/delegates/gpu/cl/kernels/util.cc | 66 +++++++++ .../lite/delegates/gpu/cl/kernels/util.h | 25 ++++ .../lite/delegates/gpu/cl/kernels/winograd.cc | 134 ++++++++++++------ 3 files changed, 182 insertions(+), 43 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/util.cc b/tensorflow/lite/delegates/gpu/cl/kernels/util.cc index 9b46c91b921..0943816f2d7 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/util.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/util.cc @@ -16,10 +16,12 @@ limitations under the License. 
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" #include +#include #include #include "absl/strings/str_cat.h" #include "absl/strings/substitute.h" +#include "tensorflow/lite/delegates/gpu/cl/precision.h" #include "tensorflow/lite/delegates/gpu/common/data_type.h" namespace tflite { @@ -225,6 +227,37 @@ std::string TensorCodeGenerator::ReadAsFloatWHDSB( address_mode); } +std::string TensorCodeGenerator::ReadAsTypeWHS( + DataType type, const std::string& x, const std::string& y, + const std::string& s, TextureAddressMode address_mode) const { + return ReadAsType(type, GetGlobalAddressNoDeclarationWHS(x, y, s), + address_mode); +} + +std::string TensorCodeGenerator::ReadAsTypeWHSB( + DataType type, const std::string& x, const std::string& y, + const std::string& s, const std::string& b, + TextureAddressMode address_mode) const { + return ReadAsType(type, GetGlobalAddressNoDeclarationWHSB(x, y, s, b), + address_mode); +} + +std::string TensorCodeGenerator::ReadAsTypeWHDS( + DataType type, const std::string& x, const std::string& y, + const std::string& z, const std::string& s, + TextureAddressMode address_mode) const { + return ReadAsType(type, GetGlobalAddressNoDeclarationWHDS(x, y, z, s), + address_mode); +} + +std::string TensorCodeGenerator::ReadAsTypeWHDSB( + DataType type, const std::string& x, const std::string& y, + const std::string& z, const std::string& s, const std::string& b, + TextureAddressMode address_mode) const { + return ReadAsType(type, GetGlobalAddressNoDeclarationWHDSB(x, y, z, s, b), + address_mode); +} + std::string TensorCodeGenerator::GetAddressWHS(const std::string& var_name, const std::string& x, const std::string& y, @@ -449,6 +482,39 @@ std::string TensorCodeGenerator::ReadAsFloat( } } +std::string TensorCodeGenerator::ReadAsType( + DataType type, const std::string& global_address, + TextureAddressMode address_mode) const { + const std::string read_as = + type == DataType::FLOAT16 ? "read_imageh" : "read_imagef"; + switch (descriptor_.storage_type) { + case TensorStorageType::BUFFER: { + const std::string reading = + absl::StrCat(tensor_name_, "[", global_address, "]"); + if (type == descriptor_.data_type) { + return reading; + } else { + const std::string conversion = + type == DataType::FLOAT16 ? "convert_half4" : "convert_float4"; + return absl::StrCat(conversion, "(", reading, ")"); + } + } + case TensorStorageType::TEXTURE_2D: + case TensorStorageType::TEXTURE_3D: + case TensorStorageType::SINGLE_TEXTURE_2D: + case TensorStorageType::TEXTURE_ARRAY: + return absl::StrCat( + read_as, "(", tensor_name_, + ", " + TextureAddressModeToString(address_mode) + ", ", + global_address, ")"); + case TensorStorageType::IMAGE_BUFFER: + return absl::StrCat(read_as, "(", tensor_name_, ", ", global_address, + ")"); + case TensorStorageType::UNKNOWN: + return ""; + } +} + std::string TensorCodeGenerator::Write( const std::string& var_name, const std::string& global_address) const { switch (descriptor_.storage_type) { diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/util.h b/tensorflow/lite/delegates/gpu/cl/kernels/util.h index 14ad9ec0bc3..02d5df6c442 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/util.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/util.h @@ -138,6 +138,28 @@ class TensorCodeGenerator { const std::string& s, const std::string& b, TextureAddressMode address_mode = TextureAddressMode::DONT_CARE) const; + // Optimization for textures, so as in opencl we can use read_imagef for any + // texture type. 
+ std::string ReadAsTypeWHS( + DataType type, const std::string& x, const std::string& y, + const std::string& s, + TextureAddressMode address_mode = TextureAddressMode::DONT_CARE) const; + + std::string ReadAsTypeWHSB( + DataType type, const std::string& x, const std::string& y, + const std::string& s, const std::string& b, + TextureAddressMode address_mode = TextureAddressMode::DONT_CARE) const; + + std::string ReadAsTypeWHDS( + DataType type, const std::string& x, const std::string& y, + const std::string& z, const std::string& s, + TextureAddressMode address_mode = TextureAddressMode::DONT_CARE) const; + + std::string ReadAsTypeWHDSB( + DataType type, const std::string& x, const std::string& y, + const std::string& z, const std::string& s, const std::string& b, + TextureAddressMode address_mode = TextureAddressMode::DONT_CARE) const; + std::string WriteWHS(const std::string& var_name, const std::string& x, const std::string& y, const std::string& s) const; @@ -161,6 +183,9 @@ class TensorCodeGenerator { std::string ReadAsFloat( const std::string& global_address, TextureAddressMode address_mode = TextureAddressMode::DONT_CARE) const; + std::string ReadAsType( + DataType type, const std::string& global_address, + TextureAddressMode address_mode = TextureAddressMode::DONT_CARE) const; std::string Write(const std::string& var_name, const std::string& global_address) const; diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc index cfc172055ab..868cca55882 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc @@ -21,6 +21,8 @@ limitations under the License. #include "absl/strings/str_format.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h" +#include "tensorflow/lite/delegates/gpu/cl/precision.h" +#include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/shape.h" #include "tensorflow/lite/delegates/gpu/common/status.h" @@ -49,8 +51,22 @@ std::string GetWinograd4x4To36Code( src_tensor_type == TensorStorageType::IMAGE_BUFFER; const bool is_buffer = src_tensor_type == TensorStorageType::BUFFER; + switch (op_def.precision) { + case CalculationsPrecision::F32: + case CalculationsPrecision::F32_F16: + c += "#define ACCUM_FLT float\n"; + break; + case CalculationsPrecision::F16: + c += "#define ACCUM_FLT half\n"; + break; + } + + const DataType accum_type = op_def.precision == CalculationsPrecision::F16 + ? 
DataType::FLOAT16 + : DataType::FLOAT32; + auto bt_mat = BtMatrixForWinograd4x4To6x6(); - c += "constant FLT Bt[36] = {\n"; + c += "constant ACCUM_FLT Bt[36] = {\n"; for (int y = 0; y < 6; ++y) { c += "\t"; for (int x = 0; x < 6; ++x) { @@ -79,10 +95,12 @@ std::string GetWinograd4x4To36Code( c += " }\n"; c += " int tile_x = (DST_X % tiles_x) * 4;\n"; c += " int tile_y = (DST_X / tiles_x) * 4;\n"; - c += " FLT4 I0, I1, I2, I3, I4, I5;\n"; - c += " FLT bt_ar[6];\n"; - c += " FLT4 t0 = " + bt_arr.ReadLinearFLT4("DST_Y * 2 + 0") + ";\n"; - c += " FLT4 t1 = " + bt_arr.ReadLinearFLT4("DST_Y * 2 + 1") + ";\n"; + c += " ACCUM_FLT4 I0, I1, I2, I3, I4, I5;\n"; + c += " ACCUM_FLT bt_ar[6];\n"; + c += " ACCUM_FLT4 t0 = TO_ACCUM_TYPE(" + + bt_arr.ReadLinearFLT4("DST_Y * 2 + 0") + ");\n"; + c += " ACCUM_FLT4 t1 = TO_ACCUM_TYPE(" + + bt_arr.ReadLinearFLT4("DST_Y * 2 + 1") + ");\n"; c += " DST_Y *= 6;\n"; c += " bt_ar[0] = t0.x;\n"; c += " bt_ar[1] = t0.y;\n"; @@ -92,15 +110,17 @@ std::string GetWinograd4x4To36Code( c += " bt_ar[5] = t1.y;\n"; auto read_src = [&](const std::string& src, const std::string& xs) { if (is_image_buffer) { - c += " FLT4 " + src + " = " + - src_tensor.Read("src_a_" + xs + " + offset") + ";\n"; + c += " ACCUM_FLT4 " + src + " = " + + src_tensor.ReadAsType(accum_type, "src_a_" + xs + " + offset") + + ";\n"; } else if (is_buffer) { - c += " FLT4 " + src + " = " + - src_tensor.Read("src_a_" + xs + " + offset") + " * m" + xs + "_x;\n"; + c += " ACCUM_FLT4 " + src + " = " + + src_tensor.ReadAsType(accum_type, "src_a_" + xs + " + offset") + + " * m" + xs + "_x;\n"; } else { - c += " FLT4 " + src + " = " + - src_tensor.ReadWHSB("tile_x + padding.x + " + xs, "yc", "DST_Z", - batch_id) + + c += " ACCUM_FLT4 " + src + " = " + + src_tensor.ReadAsTypeWHSB(accum_type, "tile_x + padding.x + " + xs, + "yc", "DST_Z", batch_id) + ";\n"; } }; @@ -108,8 +128,8 @@ std::string GetWinograd4x4To36Code( for (int x = 0; x < 6; ++x) { const std::string xs = std::to_string(x); c += " int xc" + xs + " = tile_x + padding.x + " + xs + ";\n"; - c += " FLT m" + xs + "_x = (FLT)(xc" + xs + " >= 0 && xc" + xs + - " < src_size.x);\n"; + c += " ACCUM_FLT m" + xs + "_x = (ACCUM_FLT)(xc" + xs + " >= 0 && xc" + + xs + " < src_size.x);\n"; c += " bool inx" + xs + " = (xc" + xs + " >= 0 && xc" + xs + " < src_size.x);\n"; c += " xc" + xs + " = clamp(xc" + xs + ", 0, src_size.x - 1);\n"; @@ -126,9 +146,9 @@ std::string GetWinograd4x4To36Code( if (is_buffer || is_image_buffer) { c += " bool iny = (yc >= 0 && yc < src_size.y);\n"; c += " int offset = select(0, yc * src_size.x, iny);\n"; - c += " FLT bt = bt_ar[0] * (FLT)(iny);\n"; + c += " ACCUM_FLT bt = bt_ar[0] * (ACCUM_FLT)(iny);\n"; } else { - c += " FLT bt = bt_ar[0];\n"; + c += " ACCUM_FLT bt = bt_ar[0];\n"; } for (int x = 0; x < 6; ++x) { const std::string xs = std::to_string(x); @@ -144,9 +164,9 @@ std::string GetWinograd4x4To36Code( if (is_buffer || is_image_buffer) { c += " bool iny = (yc >= 0 && yc < src_size.y);\n"; c += " int offset = select(0, yc * src_size.x, iny);\n"; - c += " FLT bt = bt_ar[" + ys + "] * (FLT)(iny);\n"; + c += " ACCUM_FLT bt = bt_ar[" + ys + "] * (ACCUM_FLT)(iny);\n"; } else { - c += " FLT bt = bt_ar[" + ys + "];\n"; + c += " ACCUM_FLT bt = bt_ar[" + ys + "];\n"; } for (int x = 0; x < 6; ++x) { const std::string xs = std::to_string(x); @@ -158,42 +178,50 @@ std::string GetWinograd4x4To36Code( } const LinkingContext context{"r0", "DST_X", "DST_Y", "DST_Z"}; c += " {\n"; - c += " FLT4 r0 = I0 + Bt[2] * I2 + Bt[4] * I4;\n"; + c += " FLT4 r0 = 
TO_FLT4(I0 + Bt[2] * I2 + Bt[4] * I4);\n"; c += PostProcess(linked_operations, context); c += " " + dst_tensor.WriteWHSB("r0", "DST_X", "DST_Y", "DST_Z", batch_id); c += " DST_Y++;\n"; c += " }\n"; c += " {\n"; - c += " FLT4 r0 = Bt[7] * I1 + Bt[8] * I2 + Bt[9] * I3 + Bt[10] * I4;\n"; + c += " FLT4 r0 = TO_FLT4(Bt[7] * I1 + Bt[8] * I2 + Bt[9] * I3 + Bt[10] * " + "I4);\n"; c += PostProcess(linked_operations, context); c += " " + dst_tensor.WriteWHSB("r0", "DST_X", "DST_Y", "DST_Z", batch_id); c += " DST_Y++;\n"; c += " }\n"; c += " {\n"; - c += " FLT4 r0 = Bt[13] * I1 + Bt[14] * I2 + Bt[15] * I3 + Bt[16] * I4;\n"; + c += " FLT4 r0 = TO_FLT4(Bt[13] * I1 + Bt[14] * I2 + Bt[15] * I3 + Bt[16] " + "* " + "I4);\n"; c += PostProcess(linked_operations, context); c += " " + dst_tensor.WriteWHSB("r0", "DST_X", "DST_Y", "DST_Z", batch_id); c += " DST_Y++;\n"; c += " }\n"; c += " {\n"; - c += " FLT4 r0 = Bt[19] * I1 + Bt[20] * I2 + Bt[21] * I3 + Bt[22] * I4;\n"; + c += " FLT4 r0 = TO_FLT4(Bt[19] * I1 + Bt[20] * I2 + Bt[21] * I3 + Bt[22] " + "* " + "I4);\n"; c += PostProcess(linked_operations, context); c += " " + dst_tensor.WriteWHSB("r0", "DST_X", "DST_Y", "DST_Z", batch_id); c += " DST_Y++;\n"; c += " }\n"; c += " {\n"; - c += " FLT4 r0 = Bt[25] * I1 + Bt[26] * I2 + Bt[27] * I3 + Bt[28] * I4;\n"; + c += " FLT4 r0 = TO_FLT4(Bt[25] * I1 + Bt[26] * I2 + Bt[27] * I3 + Bt[28] " + "* " + "I4);\n"; c += PostProcess(linked_operations, context); c += " " + dst_tensor.WriteWHSB("r0", "DST_X", "DST_Y", "DST_Z", batch_id); c += " DST_Y++;\n"; c += " }\n"; c += " {\n"; - c += " FLT4 r0 = Bt[31] * I1 + Bt[33] * I3 + I5;\n"; + c += " FLT4 r0 = TO_FLT4(Bt[31] * I1 + Bt[33] * I3 + I5);\n"; c += PostProcess(linked_operations, context); c += " " + dst_tensor.WriteWHSB("r0", "DST_X", "DST_Y", "DST_Z", batch_id); c += " DST_Y++;\n"; c += " }\n"; c += "}\n"; + // std::cout << c << std::endl; return c; } @@ -213,8 +241,22 @@ std::string GetWinograd36To4x4Code( const std::string batch_id = op_def.IsBatchSupported() ? "batch_id" : ""; std::string c = GetCommonDefines(op_def.precision); + switch (op_def.precision) { + case CalculationsPrecision::F32: + case CalculationsPrecision::F32_F16: + c += "#define ACCUM_FLT float\n"; + break; + case CalculationsPrecision::F16: + c += "#define ACCUM_FLT half\n"; + break; + } + + const DataType accum_type = op_def.precision == CalculationsPrecision::F16 + ? 
DataType::FLOAT16 + : DataType::FLOAT32; + auto at_mat = AtMatrixForWinograd4x4To6x6(); - c += "constant FLT At[24] = {\n"; + c += "constant ACCUM_FLT At[24] = {\n"; for (int y = 0; y < 4; ++y) { c += "\t"; for (int x = 0; x < 6; ++x) { @@ -243,10 +285,12 @@ std::string GetWinograd36To4x4Code( "dst_size.z) {\n"; c += " return; \n"; c += " }\n"; - c += " FLT4 I0, I1, I2, I3, I4, I5;\n"; - c += " FLT at_ar[6];\n"; - c += " FLT4 t00 = " + at_arr.ReadLinearFLT4("DST_Y * 2 + 0") + ";\n"; - c += " FLT4 t01 = " + at_arr.ReadLinearFLT4("DST_Y * 2 + 1") + ";\n"; + c += " ACCUM_FLT4 I0, I1, I2, I3, I4, I5;\n"; + c += " ACCUM_FLT at_ar[6];\n"; + c += " ACCUM_FLT4 t00 = TO_ACCUM_TYPE(" + + at_arr.ReadLinearFLT4("DST_Y * 2 + 0") + ");\n"; + c += " ACCUM_FLT4 t01 = TO_ACCUM_TYPE(" + + at_arr.ReadLinearFLT4("DST_Y * 2 + 1") + ");\n"; c += " at_ar[0] = t00.x;\n"; c += " at_ar[1] = t00.y;\n"; c += " at_ar[2] = t00.z;\n"; @@ -254,56 +298,60 @@ std::string GetWinograd36To4x4Code( c += " at_ar[4] = t01.x;\n"; c += " at_ar[5] = t01.y;\n"; c += " {\n"; - c += " FLT at = at_ar[0];\n"; + c += " ACCUM_FLT at = at_ar[0];\n"; for (int x = 0; x < 6; ++x) { const std::string yc = std::to_string(x); const std::string src = "src" + std::to_string(x); - c += " FLT4 " + src + " = " + - src_tensor.ReadWHSB("tile_id", yc, "DST_Z", batch_id) + ";\n"; + c += " ACCUM_FLT4 " + src + " = " + + src_tensor.ReadAsTypeWHSB(accum_type, "tile_id", yc, "DST_Z", + batch_id) + + ";\n"; c += " I" + std::to_string(x) + " = at * " + src + ";\n"; } c += " }\n"; for (int y = 1; y < 6; ++y) { c += " {\n"; - c += " FLT at = at_ar[" + std::to_string(y) + "];\n"; + c += " ACCUM_FLT at = at_ar[" + std::to_string(y) + "];\n"; for (int x = 0; x < 6; ++x) { const std::string yc = std::to_string(y * 6 + x); const std::string src = "src" + std::to_string(x); - c += " FLT4 " + src + " = " + - src_tensor.ReadWHSB("tile_id", yc, "DST_Z", batch_id) + ";\n"; + c += " ACCUM_FLT4 " + src + " = " + + src_tensor.ReadAsTypeWHSB(accum_type, "tile_id", yc, "DST_Z", + batch_id) + + ";\n"; c += " I" + std::to_string(x) + " += at * " + src + ";\n"; } c += " }\n"; } - c += " FLT4 t0 = I1 + I2;\n"; - c += " FLT4 t1 = I3 + I4;\n"; + c += " ACCUM_FLT4 t0 = I1 + I2;\n"; + c += " ACCUM_FLT4 t1 = I3 + I4;\n"; c += " FLT4 bias_val = " + biases.ReadLinearFLT4("DST_Z") + ";\n"; c += " {\n"; const LinkingContext context{"r0", "tile_x", "tile_y", "DST_Z"}; - c += " FLT4 r0 = I0 + t0 + t1 + bias_val;\n"; + c += " FLT4 r0 = TO_FLT4(I0 + t0 + t1) + bias_val;\n"; c += PostProcess(linked_operations, context); c += " " + dst_tensor.WriteWHSB("r0", "tile_x", "tile_y", "DST_Z", batch_id); c += " tile_x++;\n"; c += " }\n"; - c += " FLT4 t2 = I1 - I2;\n"; - c += " FLT4 t3 = I3 - I4;\n"; + c += " ACCUM_FLT4 t2 = I1 - I2;\n"; + c += " ACCUM_FLT4 t3 = I3 - I4;\n"; c += " if (tile_x < dst_size.x) {\n"; - c += " FLT4 r0 = t2 * At[7] + t3 * At[9] + bias_val;\n"; + c += " FLT4 r0 = TO_FLT4(t2 * At[7] + t3 * At[9]) + bias_val;\n"; c += PostProcess(linked_operations, context); c += " " + dst_tensor.WriteWHSB("r0", "tile_x", "tile_y", "DST_Z", batch_id); c += " tile_x++;\n"; c += " }\n"; c += " if (tile_x < dst_size.x) {\n"; - c += " FLT4 r0 = t0 * At[13] + t1 * At[15] + bias_val;\n"; + c += " FLT4 r0 = TO_FLT4(t0 * At[13] + t1 * At[15]) + bias_val;\n"; c += PostProcess(linked_operations, context); c += " " + dst_tensor.WriteWHSB("r0", "tile_x", "tile_y", "DST_Z", batch_id); c += " tile_x++;\n"; c += " }\n"; c += " if (tile_x < dst_size.x) {\n"; - c += " FLT4 r0 = t2 * At[19] + t3 * At[21] + I5 + 
bias_val;\n"; + c += " FLT4 r0 = TO_FLT4(t2 * At[19] + t3 * At[21] + I5) + bias_val;\n"; c += PostProcess(linked_operations, context); c += " " + dst_tensor.WriteWHSB("r0", "tile_x", "tile_y", "DST_Z", batch_id); From 38168415ea5bda4c04da6d55272354274da9bc52 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Wed, 19 Feb 2020 16:32:18 -0800 Subject: [PATCH 294/442] FullyConnectedTexture renamed to FullyConnected so as support all storage types. PiperOrigin-RevId: 296085184 Change-Id: I3ea56947c7ddf70370c10b4375903880fd3d83c9 --- .../lite/delegates/gpu/cl/kernels/BUILD | 14 +++---- ...onnected_texture.cc => fully_connected.cc} | 23 ++++++----- ..._connected_texture.h => fully_connected.h} | 39 ++++++++++--------- ...exture_test.cc => fully_connected_test.cc} | 10 ++--- .../lite/delegates/gpu/cl/selectors/BUILD | 2 +- .../cl/selectors/fully_connected_selector.cc | 20 +++++----- 6 files changed, 54 insertions(+), 54 deletions(-) rename tensorflow/lite/delegates/gpu/cl/kernels/{fully_connected_texture.cc => fully_connected.cc} (88%) rename tensorflow/lite/delegates/gpu/cl/kernels/{fully_connected_texture.h => fully_connected.h} (80%) rename tensorflow/lite/delegates/gpu/cl/kernels/{fully_connected_texture_test.cc => fully_connected_test.cc} (90%) diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD index 6b9bf5ce6e8..4076213cd23 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD @@ -731,9 +731,9 @@ cc_library( ) cc_library( - name = "fully_connected_texture", - srcs = ["fully_connected_texture.cc"], - hdrs = ["fully_connected_texture.h"], + name = "fully_connected", + srcs = ["fully_connected.cc"], + hdrs = ["fully_connected.h"], deps = [ ":gpu_operation", ":util", @@ -751,8 +751,8 @@ cc_library( ) cc_test( - name = "fully_connected_texture_test", - srcs = ["fully_connected_texture_test.cc"], + name = "fully_connected_test", + srcs = ["fully_connected_test.cc"], linkstatic = True, tags = tf_gpu_tests_tags() + [ "linux", @@ -760,7 +760,7 @@ cc_test( ], deps = [ ":cl_test", - ":fully_connected_texture", + ":fully_connected", "//tensorflow/lite/delegates/gpu/cl:tensor", "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", @@ -1386,7 +1386,7 @@ test_suite( "depth_wise_conv_3x3_test", "depth_wise_conv_test", "elementwise_test", - "fully_connected_texture_test", + "fully_connected_test", "lstm_test", "max_unpooling_test", "multiply_add_test", diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_texture.cc b/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.cc similarity index 88% rename from tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_texture.cc rename to tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.cc index d7192497661..e235a4f0edd 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_texture.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_texture.h" +#include "tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.h" #include #include @@ -90,18 +90,17 @@ std::string GetFullyConnectedKernelCode( } } // namespace -FullyConnectedTexture::FullyConnectedTexture(const OperationDef& definition) +FullyConnected::FullyConnected(const OperationDef& definition) : GPUOperation(definition) {} -FullyConnectedTexture::FullyConnectedTexture(FullyConnectedTexture&& kernel) +FullyConnected::FullyConnected(FullyConnected&& kernel) : GPUOperation(std::move(kernel)), weights_(std::move(kernel.weights_)), biases_(std::move(kernel.biases_)), kernel_(std::move(kernel.kernel_)), work_group_size_(kernel.work_group_size_) {} -FullyConnectedTexture& FullyConnectedTexture::operator=( - FullyConnectedTexture&& kernel) { +FullyConnected& FullyConnected::operator=(FullyConnected&& kernel) { if (this != &kernel) { weights_ = std::move(kernel.weights_); biases_ = std::move(kernel.biases_); @@ -112,7 +111,7 @@ FullyConnectedTexture& FullyConnectedTexture::operator=( return *this; } -Status FullyConnectedTexture::Compile(const CreationContext& creation_context) { +Status FullyConnected::Compile(const CreationContext& creation_context) { int wg_width = 32; int wg_height = 4; int work_items; @@ -136,7 +135,7 @@ Status FullyConnectedTexture::Compile(const CreationContext& creation_context) { return OkStatus(); } -Status FullyConnectedTexture::AddToQueue(CLCommandQueue* queue) { +Status FullyConnected::AddToQueue(CLCommandQueue* queue) { kernel_.ResetBindingCounter(); RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr())); RETURN_IF_ERROR(kernel_.SetMemoryAuto(weights_.GetMemoryPtr())); @@ -150,11 +149,11 @@ Status FullyConnectedTexture::AddToQueue(CLCommandQueue* queue) { work_group_size_); } -Status CreateFullyConnectedTexture(const CreationContext& creation_context, - const OperationDef& definition, - const FullyConnectedAttributes& attr, - FullyConnectedTexture* result) { - *result = FullyConnectedTexture(definition); +Status CreateFullyConnected(const CreationContext& creation_context, + const OperationDef& definition, + const FullyConnectedAttributes& attr, + FullyConnected* result) { + *result = FullyConnected(definition); RETURN_IF_ERROR( result->UploadWeights(attr.weights, creation_context.context)); LinearStorageCreateInfo create_info; diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_texture.h b/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.h similarity index 80% rename from tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_texture.h rename to tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.h index d3c88620ec0..83ac279a71b 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_texture.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_FULLY_CONNECTED_TEXTURE_H_ -#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_FULLY_CONNECTED_TEXTURE_H_ +#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_FULLY_CONNECTED_H_ +#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_FULLY_CONNECTED_H_ #include @@ -34,24 +34,25 @@ namespace tflite { namespace gpu { namespace cl { -class FullyConnectedTexture : public GPUOperation { +class FullyConnected : public GPUOperation { public: - FullyConnectedTexture() = default; + FullyConnected() = default; Status AddToQueue(CLCommandQueue* queue) override; Status Compile(const CreationContext& creation_context) override; // Move only - FullyConnectedTexture(FullyConnectedTexture&& kernel); - FullyConnectedTexture& operator=(FullyConnectedTexture&& kernel); - FullyConnectedTexture(const FullyConnectedTexture&) = delete; - FullyConnectedTexture& operator=(const FullyConnectedTexture&) = delete; + FullyConnected(FullyConnected&& kernel); + FullyConnected& operator=(FullyConnected&& kernel); + FullyConnected(const FullyConnected&) = delete; + FullyConnected& operator=(const FullyConnected&) = delete; private: - explicit FullyConnectedTexture(const OperationDef& definition); - friend Status CreateFullyConnectedTexture( - const CreationContext& creation_context, const OperationDef& definition, - const FullyConnectedAttributes& attr, FullyConnectedTexture* result); + explicit FullyConnected(const OperationDef& definition); + friend Status CreateFullyConnected(const CreationContext& creation_context, + const OperationDef& definition, + const FullyConnectedAttributes& attr, + FullyConnected* result); template Status UploadWeights(const ::tflite::gpu::Tensor& weights, @@ -68,7 +69,7 @@ class FullyConnectedTexture : public GPUOperation { }; template -Status FullyConnectedTexture::UploadWeights( +Status FullyConnected::UploadWeights( const ::tflite::gpu::Tensor& weights, CLContext* context) { const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4); const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4); @@ -92,7 +93,7 @@ Status FullyConnectedTexture::UploadWeights( } template -void FullyConnectedTexture::RearrangeWeights( +void FullyConnected::RearrangeWeights( const ::tflite::gpu::Tensor& weights, absl::Span dst) { const int src_depth = IntegralDivideRoundUp(weights.shape.i, 4); const int dst_depth = IntegralDivideRoundUp(weights.shape.o, 4); @@ -122,13 +123,13 @@ void FullyConnectedTexture::RearrangeWeights( } } -Status CreateFullyConnectedTexture(const CreationContext& creation_context, - const OperationDef& definition, - const FullyConnectedAttributes& attr, - FullyConnectedTexture* result); +Status CreateFullyConnected(const CreationContext& creation_context, + const OperationDef& definition, + const FullyConnectedAttributes& attr, + FullyConnected* result); } // namespace cl } // namespace gpu } // namespace tflite -#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_FULLY_CONNECTED_TEXTURE_H_ +#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_FULLY_CONNECTED_H_ diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_texture_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_test.cc similarity index 90% rename from tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_texture_test.cc rename to tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_test.cc index 0457142d707..4525d49e783 100644 --- 
a/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_texture_test.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_test.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_texture.h" +#include "tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.h" #include @@ -31,7 +31,7 @@ namespace gpu { namespace cl { namespace { -TEST_F(OpenCLOperationTest, FullyConnectedTexture) { +TEST_F(OpenCLOperationTest, FullyConnected) { TensorFloat32 src_tensor; src_tensor.shape = BHWC(1, 1, 1, 4); src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f}; @@ -51,9 +51,9 @@ TEST_F(OpenCLOperationTest, FullyConnectedTexture) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - FullyConnectedTexture operation; - ASSERT_OK(CreateFullyConnectedTexture(creation_context_, op_def, attr, - &operation)); + FullyConnected operation; + ASSERT_OK( + CreateFullyConnected(creation_context_, op_def, attr, &operation)); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 1, 1, 2), &dst_tensor)); EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), {14.5f, 37.5f})); diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/BUILD b/tensorflow/lite/delegates/gpu/cl/selectors/BUILD index c6a6902dacc..293a34df4a5 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/selectors/BUILD @@ -66,7 +66,7 @@ cc_library( "//tensorflow/lite/delegates/gpu/cl/kernels:conv_buffer_1x1", "//tensorflow/lite/delegates/gpu/cl/kernels:conv_powervr", "//tensorflow/lite/delegates/gpu/cl/kernels:conv_texture", - "//tensorflow/lite/delegates/gpu/cl/kernels:fully_connected_texture", + "//tensorflow/lite/delegates/gpu/cl/kernels:fully_connected", "//tensorflow/lite/delegates/gpu/cl/kernels:gpu_operation", "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/fully_connected_selector.cc b/tensorflow/lite/delegates/gpu/cl/selectors/fully_connected_selector.cc index f4ea5886499..05d28b412ad 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/fully_connected_selector.cc +++ b/tensorflow/lite/delegates/gpu/cl/selectors/fully_connected_selector.cc @@ -19,7 +19,7 @@ limitations under the License. 
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h" -#include "tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_texture.h" +#include "tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.h" #include "tensorflow/lite/delegates/gpu/common/operations.h" #include "tensorflow/lite/delegates/gpu/common/status.h" @@ -36,10 +36,10 @@ Status SelectFullyConnectedAdreno(const FullyConnectedAttributes& attr, RETURN_IF_ERROR(CreateConvTexture(creation_context, op_def, attr, &conv)); *ptr = absl::make_unique(std::move(conv)); } else { - FullyConnectedTexture fc; + FullyConnected fc; RETURN_IF_ERROR( - CreateFullyConnectedTexture(creation_context, op_def, attr, &fc)); - *ptr = absl::make_unique(std::move(fc)); + CreateFullyConnected(creation_context, op_def, attr, &fc)); + *ptr = absl::make_unique(std::move(fc)); } return OkStatus(); } @@ -53,10 +53,10 @@ Status SelectFullyConnectedPowerVR(const FullyConnectedAttributes& attr, RETURN_IF_ERROR(CreateConvPowerVR(creation_context, op_def, attr, &conv)); *ptr = absl::make_unique(std::move(conv)); } else { - FullyConnectedTexture fc; + FullyConnected fc; RETURN_IF_ERROR( - CreateFullyConnectedTexture(creation_context, op_def, attr, &fc)); - *ptr = absl::make_unique(std::move(fc)); + CreateFullyConnected(creation_context, op_def, attr, &fc)); + *ptr = absl::make_unique(std::move(fc)); } return OkStatus(); } @@ -77,10 +77,10 @@ Status SelectFullyConnectedMali(const FullyConnectedAttributes& attr, *ptr = absl::make_unique(std::move(conv)); } } else { - FullyConnectedTexture fc; + FullyConnected fc; RETURN_IF_ERROR( - CreateFullyConnectedTexture(creation_context, op_def, attr, &fc)); - *ptr = absl::make_unique(std::move(fc)); + CreateFullyConnected(creation_context, op_def, attr, &fc)); + *ptr = absl::make_unique(std::move(fc)); } return OkStatus(); } From 0dd277c6746fa71a314d53e39e0cd1fe4aa931ff Mon Sep 17 00:00:00 2001 From: Ruoxin Sang Date: Wed, 19 Feb 2020 16:36:25 -0800 Subject: [PATCH 295/442] Make TPUStrategy work with tf.function(experimental_compile=True). This involves two changes: 1. Only create replicated var handle inside TPUReplicateContext. 2. If the function annotated with experimental_compile=True is called inside a XLAControlFlowContext, don't create a new XLAControlFlowContext. 
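As a minimal illustration of change (2) above (not part of the patch itself): the guard amounts to entering a fresh XLAControlFlowContext only when no XLA context is already active. The helper name `run_in_xla_context` is hypothetical; the utilities it calls (`control_flow_util.GraphOrParentsInXlaContext`, `control_flow_ops.XLAControlFlowContext`) are the same ones the def_function.py hunk below relies on.

from tensorflow.python.framework import ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import control_flow_util


def run_in_xla_context(fn, *args, **kwargs):
  """Sketch: enter an XLAControlFlowContext only if none is already active."""
  if control_flow_util.GraphOrParentsInXlaContext(ops.get_default_graph()):
    # Already inside tpu.rewrite()/XLA compilation; reuse the outer context
    # rather than nesting a new one.
    return fn(*args, **kwargs)
  xla_context = control_flow_ops.XLAControlFlowContext()
  try:
    xla_context.Enter()
    return fn(*args, **kwargs)
  finally:
    xla_context.Exit()
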
PiperOrigin-RevId: 296086034 Change-Id: I821f3b3cd5ba69cd4c7bdb9c28e13e4b4c83f967 --- tensorflow/python/distribute/BUILD | 1 + .../custom_training_loop_models_test.py | 44 +++++++++++++++++++ tensorflow/python/distribute/values.py | 5 ++- tensorflow/python/eager/BUILD | 1 + tensorflow/python/eager/def_function.py | 8 +++- 5 files changed, 55 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index bc6865c8617..a4e2795ce2e 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -620,6 +620,7 @@ py_library( "//tensorflow/python:training", "//tensorflow/python:util", "//tensorflow/python/eager:context", + "//tensorflow/python/tpu:tpu_lib", "//tensorflow/python/training/tracking:base", "@six_archive//:six", ], diff --git a/tensorflow/python/distribute/custom_training_loop_models_test.py b/tensorflow/python/distribute/custom_training_loop_models_test.py index dcce40a2f80..6fafa43677c 100644 --- a/tensorflow/python/distribute/custom_training_loop_models_test.py +++ b/tensorflow/python/distribute/custom_training_loop_models_test.py @@ -354,6 +354,50 @@ class KerasModelsTest(test.TestCase, parameterized.TestCase): with distribution.scope(): model = CustomModel() + @def_function.function + def train_step(iterator): + + def step_fn(inputs): + images, targets = inputs + with backprop.GradientTape() as tape: + outputs = model(images) + loss = math_ops.reduce_sum(outputs - targets) + grads = tape.gradient(loss, model.variables) + return grads + + outputs = distribution.experimental_run_v2( + step_fn, args=(next(iterator),)) + return nest.map_structure(distribution.experimental_local_results, + outputs) + + train_step(input_iterator) + + @combinations.generate( + combinations.combine( + distribution=strategy_combinations.tpu_strategies, mode=["eager"])) + def test_tf_function_experimental_compile(self, distribution): + dataset = self._get_dataset() + input_iterator = iter(distribution.experimental_distribute_dataset(dataset)) + + class CustomDense(keras.layers.Layer): + + def __init__(self, num_outputs): + super(CustomDense, self).__init__() + self.num_outputs = num_outputs + + def build(self, input_shape): + self.kernel = self.add_variable( + "kernel", shape=[int(input_shape[-1]), self.num_outputs]) + + @def_function.function(experimental_compile=True) + def call(self, inputs): + return math_ops.matmul(inputs, self.kernel) + + with distribution.scope(): + x = keras.layers.Input(shape=(3,)) + y = CustomDense(4)(x) + model = keras.Model(x, y) + @def_function.function def train_step(iterator): def step_fn(inputs): diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py index baf3b8295dc..74e9c600cee 100644 --- a/tensorflow/python/distribute/values.py +++ b/tensorflow/python/distribute/values.py @@ -38,6 +38,7 @@ from tensorflow.python.ops import gen_resource_variable_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import variable_scope as vs from tensorflow.python.ops import variables as variables_lib +from tensorflow.python.tpu import tpu from tensorflow.python.training import saver from tensorflow.python.training.tracking import base as trackable from tensorflow.python.util import nest @@ -938,14 +939,14 @@ ops.register_tensor_conversion_function(Mirrored, def _enclosing_tpu_context(): - """Returns the XLAControlFlowContext, which exists inside a tpu.rewrite().""" + """Returns the TPUReplicateContext, which exists inside a tpu.rewrite().""" 
graph = ops.get_default_graph() while graph is not None: # pylint: disable=protected-access context_ = graph._get_control_flow_context() # pylint: enable=protected-access while context_ is not None: - if isinstance(context_, control_flow_ops.XLAControlFlowContext): + if isinstance(context_, tpu.TPUReplicateContext): return context_ context_ = context_.outer_context # This may be a FuncGraph due to defuns or v2 control flow. We need to diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD index 65d07846cea..7aef5da11f2 100644 --- a/tensorflow/python/eager/BUILD +++ b/tensorflow/python/eager/BUILD @@ -689,6 +689,7 @@ py_library( ":lift_to_graph", "//tensorflow/python:cond_v2", # TODO(b/118513001): Imported via control_flow_ops; remove. "//tensorflow/python:control_flow_ops", + "//tensorflow/python:control_flow_util", "//tensorflow/python:framework_ops", "//tensorflow/python:resource_variable_ops", "//tensorflow/python:util", diff --git a/tensorflow/python/eager/def_function.py b/tensorflow/python/eager/def_function.py index a2bcb91918b..76af2d32c3e 100644 --- a/tensorflow/python/eager/def_function.py +++ b/tensorflow/python/eager/def_function.py @@ -31,6 +31,7 @@ from tensorflow.python.framework import func_graph as func_graph_module from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import control_flow_util from tensorflow.python.ops import math_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.platform import tf_logging as logging @@ -563,9 +564,12 @@ class Function(object): return self._python_function(*args, **kwds) tracing_count = self._get_tracing_count() - if self._experimental_compile: + if self._experimental_compile and ( + not control_flow_util.GraphOrParentsInXlaContext( + ops.get_default_graph())): # V2 control flow relies on XLAControlFlowContext to generate a - # XLA-compatible function graph. + # XLA-compatible function graph. If the function is already called inside + # an XLA context, we don't create nested XLA context. xla_context = control_flow_ops.XLAControlFlowContext() try: xla_context.Enter() From 9c7537daae43a49ea154300f5b51246888b0cc53 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Wed, 19 Feb 2020 16:37:06 -0800 Subject: [PATCH 296/442] Use io::JoinPath to build paths and avoid `\` when constructing file names. JoinPath is being made to deal with different OS path separators. Defaulting to `/` doesn't work in all cases. As an example, on Windows, when a `\` is in a path, `/` is no longer considered a path separator which is why we want to avoid it in filenames. PiperOrigin-RevId: 296086151 Change-Id: Ib2dfef55e9e779ff5138f960dc462fce8a14833b --- .../compiler/mlir/tensorflow/utils/dump_mlir_util.cc | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc index f06734a26bd..1b8ae8403bf 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc @@ -43,7 +43,8 @@ std::string MakeUniqueFilename(string name) { // Remove illegal characters from `name`. for (int i = 0; i < name.size(); ++i) { char ch = name[i]; - if (ch == '/' || ch == '[' || ch == ']' || ch == '*' || ch == '?') { + if (ch == '/' || ch == '[' || ch == ']' || ch == '*' || ch == '?' 
|| + ch == '\\') { name[i] = '_'; } } @@ -123,10 +124,7 @@ Status CreateFileForDumping(llvm::StringRef name, << "' directory for dumping: " << status; return Status(error::Code::UNAVAILABLE, "(unavailable)"); } - *filepath = llvm::Twine(dir) - .concat("/") - .concat(MakeUniqueFilename(std::string(name))) - .str(); + *filepath = io::JoinPath(dir, MakeUniqueFilename(std::string(name))); // Try to open the file and generate a raw_ostream. std::unique_ptr file; From 7f48bded8a6f9d61857805cf194f74ae2beb72f3 Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Wed, 19 Feb 2020 16:37:12 -0800 Subject: [PATCH 297/442] [XLA/GPU] Change rounding scheme for tree reduction to round up to nearest square Previously, we were rounding up to the nearest divisor of the largest batch we could handle without introducing atomics. That leads to: - Very large padding, e.g. rounding up 8193 to 16384 - Very small dimensions of extra reduction kernels, e.g. 2 Instead, this CL uses a more "even" rounding scheme, where we round up the number to the nearest square. Nearest square is guaranteed to be within 2 * sqrt(N) of a number N, so required padding is fairly small even in the worst case. PiperOrigin-RevId: 296086172 Change-Id: I7bfa72b2309fd1e3c596d6e028a9468660f84879 --- .../gpu/tests/tree_reduction_rewriter_test.cc | 134 ++++++++---------- .../service/gpu/tree_reduction_rewriter.cc | 47 +++--- 2 files changed, 84 insertions(+), 97 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/tests/tree_reduction_rewriter_test.cc b/tensorflow/compiler/xla/service/gpu/tests/tree_reduction_rewriter_test.cc index c0210ff941d..eb821c36fae 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/tree_reduction_rewriter_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/tree_reduction_rewriter_test.cc @@ -67,24 +67,23 @@ ENTRY main { zero = f32[] constant(0) ROOT out = f32[] reduce(input, zero), dimensions={0}, to_apply=add } - )"; // TODO(cheshire): a more generic check, do not hardcode the names. 
MatchOptimizedHloWithShapes(hlo_text, R"( -// CHECK: %fused_computation (param_0.2: f32[50000]) -> f32[7] { +// CHECK: %fused_computation (param_0.2: f32[50000]) -> f32[224] { // CHECK: %param_0.2 = f32[50000]{0} parameter(0) // CHECK: %zero_1 = f32[] constant(0) -// CHECK: %pad.1 = f32[57344]{0} pad(f32[50000]{0} %param_0.2, f32[] %zero_1), padding=0_7344 -// CHECK: %bitcast.1 = f32[7,8192]{1,0} bitcast(f32[57344]{0} %pad.1) -// CHECK: ROOT %reduce.2 = f32[7]{0} reduce(f32[7,8192]{1,0} %bitcast.1, f32[] %zero_1), dimensions={1}, to_apply=%add +// CHECK: %pad.1 = f32[50176]{0} pad(f32[50000]{0} %param_0.2, f32[] %zero_1), padding=0_176 +// CHECK: %bitcast.1 = f32[224,224]{1,0} bitcast(f32[50176]{0} %pad.1) +// CHECK: ROOT %reduce.2 = f32[224]{0} reduce(f32[224,224]{1,0} %bitcast.1, f32[] %zero_1), dimensions={1}, to_apply=%add // CHECK: } // CHECK: ENTRY %main (input: f32[50000]) -> f32[] { // CHECK: %input = f32[50000]{0} parameter(0) -// CHECK: %fusion = f32[7]{0} fusion(f32[50000]{0} %input), kind=kInput, calls=%fused_computation +// CHECK: %fusion = f32[224]{0} fusion(f32[50000]{0} %input), kind=kInput, calls=%fused_computation // CHECK: %zero = f32[] constant(0) -// CHECK: ROOT %reduce.1 = f32[] reduce(f32[7]{0} %fusion, f32[] %zero), dimensions={0}, to_apply=%add +// CHECK: ROOT %reduce.1 = f32[] reduce(f32[224]{0} %fusion, f32[] %zero), dimensions={0}, to_apply=%add // CHECK: } )"); @@ -107,27 +106,25 @@ ENTRY main { zero = f32[] constant(0) ROOT out = f32[100,100] reduce(input, zero), dimensions={2}, to_apply=add } - )"; EnsureDeterminism(hlo_text); MatchOptimizedHloWithShapes(hlo_text, R"( -// CHECK: %fused_computation (param_0.2: f32[100,100,10000]) -> f32[100,100,2] { +// CHECK: %fused_computation (param_0.2: f32[100,100,10000]) -> f32[100,100,100] { // CHECK: %param_0.2 = f32[100,100,10000]{2,1,0} parameter(0) // CHECK: %zero_1 = f32[] constant(0) -// CHECK: %pad.1 = f32[100,100,16384]{2,1,0} pad(f32[100,100,10000]{2,1,0} %param_0.2, f32[] %zero_1), padding=0_0x0_0x0_6384 -// CHECK: %bitcast.1 = f32[100,100,2,8192]{3,2,1,0} bitcast(f32[100,100,16384]{2,1,0} %pad.1) -// CHECK: ROOT %reduce.2 = f32[100,100,2]{2,1,0} reduce(f32[100,100,2,8192]{3,2,1,0} %bitcast.1, f32[] %zero_1), dimensions={3}, to_apply=%add +// CHECK: %pad.1 = f32[100,100,10000]{2,1,0} pad(f32[100,100,10000]{2,1,0} %param_0.2, f32[] %zero_1), padding=0_0x0_0x0_0 +// CHECK: %bitcast.1 = f32[100,100,100,100]{3,2,1,0} bitcast(f32[100,100,10000]{2,1,0} %pad.1) +// CHECK: ROOT %reduce.2 = f32[100,100,100]{2,1,0} reduce(f32[100,100,100,100]{3,2,1,0} %bitcast.1, f32[] %zero_1), dimensions={3}, to_apply=%add // CHECK: } // CHECK: ENTRY %main (input: f32[100,100,10000]) -> f32[100,100] { // CHECK: %input = f32[100,100,10000]{2,1,0} parameter(0) -// CHECK: %fusion = f32[100,100,2]{2,1,0} fusion(f32[100,100,10000]{2,1,0} %input), kind=kInput, calls=%fused_computation +// CHECK: %fusion = f32[100,100,100]{2,1,0} fusion(f32[100,100,10000]{2,1,0} %input), kind=kInput, calls=%fused_computation // CHECK: %zero = f32[] constant(0) -// CHECK: ROOT %reduce.1 = f32[100,100]{1,0} reduce(f32[100,100,2]{2,1,0} %fusion, f32[] %zero), dimensions={2}, to_apply=%add +// CHECK: ROOT %reduce.1 = f32[100,100]{1,0} reduce(f32[100,100,100]{2,1,0} %fusion, f32[] %zero), dimensions={2}, to_apply=%add // CHECK: } - )"); EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); @@ -149,23 +146,22 @@ ENTRY main { zero = f32[] constant(0) ROOT out = f32[] reduce(input, zero), dimensions={0}, to_apply=add } - )"; 
MatchOptimizedHloWithShapes(hlo_text, R"( -// CHECK: %fused_computation (param_0.2: f32[1000000]) -> f32[123] { +// CHECK: %fused_computation (param_0.2: f32[1000000]) -> f32[1000] { // CHECK: %param_0.2 = f32[1000000]{0} parameter(0) // CHECK: %zero_1 = f32[] constant(0) -// CHECK: %pad.1 = f32[1007616]{0} pad(f32[1000000]{0} %param_0.2, f32[] %zero_1), padding=0_7616 -// CHECK: %bitcast.1 = f32[123,8192]{1,0} bitcast(f32[1007616]{0} %pad.1) -// CHECK: ROOT %reduce.2 = f32[123]{0} reduce(f32[123,8192]{1,0} %bitcast.1, f32[] %zero_1), dimensions={1}, to_apply=%add +// CHECK: %pad.1 = f32[1000000]{0} pad(f32[1000000]{0} %param_0.2, f32[] %zero_1), padding=0_0 +// CHECK: %bitcast.1 = f32[1000,1000]{1,0} bitcast(f32[1000000]{0} %pad.1) +// CHECK: ROOT %reduce.2 = f32[1000]{0} reduce(f32[1000,1000]{1,0} %bitcast.1, f32[] %zero_1), dimensions={1}, to_apply=%add // CHECK: } // CHECK: ENTRY %main (input: f32[1000000]) -> f32[] { // CHECK: %input = f32[1000000]{0} parameter(0) -// CHECK: %fusion = f32[123]{0} fusion(f32[1000000]{0} %input), kind=kInput, calls=%fused_computation +// CHECK: %fusion = f32[1000]{0} fusion(f32[1000000]{0} %input), kind=kInput, calls=%fused_computation // CHECK: %zero = f32[] constant(0) -// CHECK: ROOT %reduce.1 = f32[] reduce(f32[123]{0} %fusion, f32[] %zero), dimensions={0}, to_apply=%add +// CHECK: ROOT %reduce.1 = f32[] reduce(f32[1000]{0} %fusion, f32[] %zero), dimensions={0}, to_apply=%add // CHECK: } )"); @@ -188,25 +184,24 @@ ENTRY main { zero = f32[] constant(0) ROOT out = f32[100] reduce(input, zero), dimensions={0,2}, to_apply=add } - )"; EnsureDeterminism(hlo_text); MatchOptimizedHloWithShapes(hlo_text, R"( -// CHECK: %fused_computation (param_0.2: f32[8,100,10000]) -> f32[100,2] { +// CHECK: %fused_computation (param_0.2: f32[8,100,10000]) -> f32[100,100] { // CHECK: %param_0.2 = f32[8,100,10000]{2,1,0} parameter(0) // CHECK: %zero_1 = f32[] constant(0) -// CHECK: %pad.1 = f32[8,100,16384]{2,1,0} pad(f32[8,100,10000]{2,1,0} %param_0.2, f32[] %zero_1), padding=0_0x0_0x0_6384 -// CHECK: %bitcast.1 = f32[8,100,2,8192]{3,2,1,0} bitcast(f32[8,100,16384]{2,1,0} %pad.1) -// CHECK: ROOT %reduce.2 = f32[100,2]{1,0} reduce(f32[8,100,2,8192]{3,2,1,0} %bitcast.1, f32[] %zero_1), dimensions={3,0}, to_apply=%add +// CHECK: %pad.1 = f32[8,100,10000]{2,1,0} pad(f32[8,100,10000]{2,1,0} %param_0.2, f32[] %zero_1), padding=0_0x0_0x0_0 +// CHECK: %bitcast.1 = f32[8,100,100,100]{3,2,1,0} bitcast(f32[8,100,10000]{2,1,0} %pad.1) +// CHECK: ROOT %reduce.2 = f32[100,100]{1,0} reduce(f32[8,100,100,100]{3,2,1,0} %bitcast.1, f32[] %zero_1), dimensions={3,0}, to_apply=%add // CHECK: } // CHECK: ENTRY %main (input: f32[8,100,10000]) -> f32[100] { // CHECK: %input = f32[8,100,10000]{2,1,0} parameter(0) -// CHECK: %fusion = f32[100,2]{1,0} fusion(f32[8,100,10000]{2,1,0} %input), kind=kInput, calls=%fused_computation +// CHECK: %fusion = f32[100,100]{1,0} fusion(f32[8,100,10000]{2,1,0} %input), kind=kInput, calls=%fused_computation // CHECK: %zero = f32[] constant(0) -// CHECK: ROOT %reduce.1 = f32[100]{0} reduce(f32[100,2]{1,0} %fusion, f32[] %zero), dimensions={1}, to_apply=%add +// CHECK: ROOT %reduce.1 = f32[100]{0} reduce(f32[100,100]{1,0} %fusion, f32[] %zero), dimensions={1}, to_apply=%add // CHECK: } )"); @@ -234,23 +229,19 @@ ENTRY main { MatchOptimizedHloWithShapes(hlo_text, R"( -// CHECK: %fused_computation (param_0.4: f32[32,100,2]) -> f32[100] { -// CHECK: %param_0.4 = f32[32,100,2]{2,1,0} parameter(0) +// CHECK: %fused_computation (param_0.2: f32[32,100,10000]) -> 
f32[32,100,100] { +// CHECK: %param_0.2 = f32[32,100,10000]{2,1,0} parameter(0) // CHECK: %zero_1 = f32[] constant(0) -// CHECK: %reduce.5 = f32[32,100]{1,0} reduce(f32[32,100,2]{2,1,0} %param_0.4, f32[] %zero_1), dimensions={2}, to_apply=%add -// CHECK: ROOT %reduce.4 = f32[100]{0} reduce(f32[32,100]{1,0} %reduce.5, f32[] %zero_1), dimensions={0}, to_apply=%add -// CHECK: } -// CHECK: %fused_computation.1 (param_0.5: f32[32,100,10000]) -> f32[32,100,2] { -// CHECK: %param_0.5 = f32[32,100,10000]{2,1,0} parameter(0) -// CHECK: %zero_2 = f32[] constant(0) -// CHECK: %pad.1 = f32[32,100,16384]{2,1,0} pad(f32[32,100,10000]{2,1,0} %param_0.5, f32[] %zero_2), padding=0_0x0_0x0_6384 -// CHECK: %bitcast.1 = f32[32,100,2,8192]{3,2,1,0} bitcast(f32[32,100,16384]{2,1,0} %pad.1) -// CHECK: ROOT %reduce.6 = f32[32,100,2]{2,1,0} reduce(f32[32,100,2,8192]{3,2,1,0} %bitcast.1, f32[] %zero_2), dimensions={3}, to_apply=%add +// CHECK: %pad.1 = f32[32,100,10000]{2,1,0} pad(f32[32,100,10000]{2,1,0} %param_0.2, f32[] %zero_1), padding=0_0x0_0x0_0 +// CHECK: %bitcast.1 = f32[32,100,100,100]{3,2,1,0} bitcast(f32[32,100,10000]{2,1,0} %pad.1) +// CHECK: ROOT %reduce.4 = f32[32,100,100]{2,1,0} reduce(f32[32,100,100,100]{3,2,1,0} %bitcast.1, f32[] %zero_1), dimensions={3}, to_apply=%add // CHECK: } // CHECK: ENTRY %main (input: f32[32,100,10000]) -> f32[100] { // CHECK: %input = f32[32,100,10000]{2,1,0} parameter(0) -// CHECK: %fusion.1 = f32[32,100,2]{2,1,0} fusion(f32[32,100,10000]{2,1,0} %input), kind=kInput, calls=%fused_computation.1 -// CHECK: ROOT %fusion = f32[100]{0} fusion(f32[32,100,2]{2,1,0} %fusion.1), kind=kInput, calls=%fused_computation +// CHECK: %fusion = f32[32,100,100]{2,1,0} fusion(f32[32,100,10000]{2,1,0} %input), kind=kInput, calls=%fused_computation +// CHECK: %zero = f32[] constant(0) +// CHECK: %reduce.3 = f32[32,100]{1,0} reduce(f32[32,100,100]{2,1,0} %fusion, f32[] %zero), dimensions={2}, to_apply=%add +// CHECK: ROOT %reduce.1 = f32[100]{0} reduce(f32[32,100]{1,0} %reduce.3, f32[] %zero), dimensions={0}, to_apply=%add // CHECK: } )"); @@ -274,22 +265,22 @@ ENTRY main { zero = f32[] constant(0) ROOT out = f32[100] reduce(input, zero), dimensions={0}, to_apply=add } - )"; MatchOptimizedHloWithShapes(hlo_text, R"( -// CHECK: %fused_computation (param_0.2: f32[10000,100]) -> f32[100] { -// CHECK: %param_0.2 = f32[10000,100]{1,0} parameter(0) -// CHECK: %zero_1 = f32[] constant(0) -// CHECK: %pad.1 = f32[12288,100]{1,0} pad(f32[10000,100]{1,0} %param_0.2, f32[] %zero_1), padding=0_2288x0_0 -// CHECK: %bitcast.1 = f32[3,4096,100]{2,1,0} bitcast(f32[12288,100]{1,0} %pad.1) -// CHECK: %reduce.3 = f32[4096,100]{1,0} reduce(f32[3,4096,100]{2,1,0} %bitcast.1, f32[] %zero_1), dimensions={0}, to_apply=%add -// CHECK: ROOT %reduce.2 = f32[100]{0} reduce(f32[4096,100]{1,0} %reduce.3, f32[] %zero_1), dimensions={0}, to_apply=%add +// CHECK: %fused_computation (param_0.2: f32[10000,100]) -> f32[100,100] { +// CHECK: %param_0.2 = f32[10000,100]{1,0} parameter(0) +// CHECK: %zero_1 = f32[] constant(0) +// CHECK: %pad.1 = f32[10000,100]{1,0} pad(f32[10000,100]{1,0} %param_0.2, f32[] %zero_1), padding=0_0x0_0 +// CHECK: %bitcast.1 = f32[100,100,100]{2,1,0} bitcast(f32[10000,100]{1,0} %pad.1) +// CHECK: ROOT %reduce.2 = f32[100,100]{1,0} reduce(f32[100,100,100]{2,1,0} %bitcast.1, f32[] %zero_1), dimensions={0}, to_apply=%add // CHECK: } // CHECK: ENTRY %main (input: f32[10000,100]) -> f32[100] { -// CHECK: %input = f32[10000,100]{1,0} parameter(0) -// CHECK: ROOT %fusion = f32[100]{0} 
fusion(f32[10000,100]{1,0} %input), kind=kInput, calls=%fused_computation +// CHECK: %input = f32[10000,100]{1,0} parameter(0) +// CHECK: %fusion = f32[100,100]{1,0} fusion(f32[10000,100]{1,0} %input), kind=kInput, calls=%fused_computation +// CHECK: %zero = f32[] constant(0) +// CHECK: ROOT %reduce.1 = f32[100]{0} reduce(f32[100,100]{1,0} %fusion, f32[] %zero), dimensions={0}, to_apply=%add // CHECK: } )"); @@ -316,17 +307,18 @@ ENTRY main { MatchOptimizedHloWithShapes(hlo_text, R"( -// CHECK: %fused_computation (param_0.2: f32[10000,2,2,2]) -> f32[2,2,2] { -// CHECK: %param_0.2 = f32[10000,2,2,2]{3,2,1,0} parameter(0) -// CHECK: %zero_1 = f32[] constant(0) -// CHECK: %pad.1 = f32[12288,2,2,2]{3,2,1,0} pad(f32[10000,2,2,2]{3,2,1,0} %param_0.2, f32[] %zero_1), padding=0_2288x0_0x0_0x0_0 -// CHECK: %bitcast.1 = f32[3,4096,2,2,2]{4,3,2,1,0} bitcast(f32[12288,2,2,2]{3,2,1,0} %pad.1) -// CHECK: %reduce.3 = f32[4096,2,2,2]{3,2,1,0} reduce(f32[3,4096,2,2,2]{4,3,2,1,0} %bitcast.1, f32[] %zero_1), dimensions={0}, to_apply=%add -// CHECK: ROOT %reduce.2 = f32[2,2,2]{2,1,0} reduce(f32[4096,2,2,2]{3,2,1,0} %reduce.3, f32[] %zero_1), dimensions={0}, to_apply=%add +// CHECK: %fused_computation (param_0.2: f32[10000,2,2,2]) -> f32[100,2,2,2] { +// CHECK: %param_0.2 = f32[10000,2,2,2]{3,2,1,0} parameter(0) +// CHECK: %zero_1 = f32[] constant(0) +// CHECK: %pad.1 = f32[10000,2,2,2]{3,2,1,0} pad(f32[10000,2,2,2]{3,2,1,0} %param_0.2, f32[] %zero_1), padding=0_0x0_0x0_0x0_0 +// CHECK: %bitcast.1 = f32[100,100,2,2,2]{4,3,2,1,0} bitcast(f32[10000,2,2,2]{3,2,1,0} %pad.1) +// CHECK: ROOT %reduce.2 = f32[100,2,2,2]{3,2,1,0} reduce(f32[100,100,2,2,2]{4,3,2,1,0} %bitcast.1, f32[] %zero_1), dimensions={0}, to_apply=%add // CHECK: } // CHECK: ENTRY %main (input: f32[10000,2,2,2]) -> f32[2,2,2] { -// CHECK: %input = f32[10000,2,2,2]{3,2,1,0} parameter(0) -// CHECK: ROOT %fusion = f32[2,2,2]{2,1,0} fusion(f32[10000,2,2,2]{3,2,1,0} %input), kind=kInput, calls=%fused_computation +// CHECK: %input = f32[10000,2,2,2]{3,2,1,0} parameter(0) +// CHECK: %fusion = f32[100,2,2,2]{3,2,1,0} fusion(f32[10000,2,2,2]{3,2,1,0} %input), kind=kInput, calls=%fused_computation +// CHECK: %zero = f32[] constant(0) +// CHECK: ROOT %reduce.1 = f32[2,2,2]{2,1,0} reduce(f32[100,2,2,2]{3,2,1,0} %fusion, f32[] %zero), dimensions={0}, to_apply=%add // CHECK: } )"); @@ -355,18 +347,18 @@ ENTRY main { MatchOptimizedHloWithShapes(hlo_text, R"( -// CHECK: %fused_computation (param_0.2: f32[1000000,5]) -> f32[4096,5] { -// CHECK: %param_0.2 = f32[1000000,5]{1,0} parameter(0) -// CHECK: %zero_1 = f32[] constant(0) -// CHECK: %pad.1 = f32[1003520,5]{1,0} pad(f32[1000000,5]{1,0} %param_0.2, f32[] %zero_1), padding=0_3520x0_0 -// CHECK: %bitcast.1 = f32[245,4096,5]{2,1,0} bitcast(f32[1003520,5]{1,0} %pad.1) -// CHECK: ROOT %reduce.2 = f32[4096,5]{1,0} reduce(f32[245,4096,5]{2,1,0} %bitcast.1, f32[] %zero_1), dimensions={0}, to_apply=%add +// CHECK: %fused_computation (param_0.2: f32[1000000,5]) -> f32[1000,5] { +// CHECK: %param_0.2 = f32[1000000,5]{1,0} parameter(0) +// CHECK: %zero_1 = f32[] constant(0) +// CHECK: %pad.1 = f32[1000000,5]{1,0} pad(f32[1000000,5]{1,0} %param_0.2, f32[] %zero_1), padding=0_0x0_0 +// CHECK: %bitcast.1 = f32[1000,1000,5]{2,1,0} bitcast(f32[1000000,5]{1,0} %pad.1) +// CHECK: ROOT %reduce.2 = f32[1000,5]{1,0} reduce(f32[1000,1000,5]{2,1,0} %bitcast.1, f32[] %zero_1), dimensions={0}, to_apply=%add // CHECK: } // CHECK: ENTRY %main (input: f32[1000000,5]) -> f32[5] { -// CHECK: %input = f32[1000000,5]{1,0} parameter(0) -// 
CHECK: %fusion = f32[4096,5]{1,0} fusion(f32[1000000,5]{1,0} %input), kind=kInput, calls=%fused_computation -// CHECK: %zero = f32[] constant(0) -// CHECK: ROOT %reduce.1 = f32[5]{0} reduce(f32[4096,5]{1,0} %fusion, f32[] %zero), dimensions={0}, to_apply=%add +// CHECK: %input = f32[1000000,5]{1,0} parameter(0) +// CHECK: %fusion = f32[1000,5]{1,0} fusion(f32[1000000,5]{1,0} %input), kind=kInput, calls=%fused_computation +// CHECK: %zero = f32[] constant(0) +// CHECK: ROOT %reduce.1 = f32[5]{0} reduce(f32[1000,5]{1,0} %fusion, f32[] %zero), dimensions={0}, to_apply=%add // CHECK: } )"); diff --git a/tensorflow/compiler/xla/service/gpu/tree_reduction_rewriter.cc b/tensorflow/compiler/xla/service/gpu/tree_reduction_rewriter.cc index 5dad97dab39..e6d4569478c 100644 --- a/tensorflow/compiler/xla/service/gpu/tree_reduction_rewriter.cc +++ b/tensorflow/compiler/xla/service/gpu/tree_reduction_rewriter.cc @@ -46,6 +46,11 @@ static constexpr int64 kColumnAtomicFreeBound = kWarpSize * 128; // decreased column/row tiling. static constexpr int64 kBatchedAtomicFreeBound = 8; +// Returns the square root of the input rounded up to the nearest square. +static int64 SqrtOfRoundUpToNearestSquare(int64 input) { + return static_cast(std::ceil(std::sqrt(input))); +} + class ReductionRewriterVisitor : public DfsHloRewriteVisitor { public: explicit ReductionRewriterVisitor() {} @@ -105,39 +110,29 @@ class ReductionRewriterVisitor : public DfsHloRewriteVisitor { int64 reduced_dim_size = input_shape.dimensions(reduced_input_dimension); VLOG(3) << "reduced_dim_size = " << reduced_dim_size; - // TODO(cheshire): if atomic_free_bound is large, num_fit is likely to be - // small. Generating a reduction with very small reduced dimension is not - // efficient, it would be better to split the dimension sizes more evenly. - // - // One possible idea is to pad to a nearest square (ceil(sqrt(x)))^2. - // Given that: + + // We pad to a nearest square (ceil(sqrt(x)))^2. Given that: // // (n + 1)^2 = n^2 + (2n+1) // // it can be seen that the distance to the nearest square is at most twice // the square root of the input number. - int64 num_fit = CeilOfRatio(reduced_dim_size, atomic_free_bound); + int64 num_fit = SqrtOfRoundUpToNearestSquare(reduced_dim_size); // Pad reduced dimension to the required number of elements. HloInstruction *padded = [&] { - // TODO(cheshire): if atomic_free_bound is very large, padding all the way - // up to to atomic_free_bound is wasteful, we could pad to a much smaller - // value. 
- if (reduced_dim_size % atomic_free_bound != 0) { - int64 padded_num_elements = num_fit * atomic_free_bound; - PaddingConfig padding_config = MakeNoPaddingConfig(input_shape.rank()); - padding_config.mutable_dimensions(reduced_input_dimension) - ->set_edge_padding_high(padded_num_elements - reduced_dim_size); - std::vector padded_dimensions(input_shape.dimensions().begin(), - input_shape.dimensions().end()); - padded_dimensions[reduced_input_dimension] = padded_num_elements; - Shape padded_shape = - ShapeUtil::MakeShape(input_shape.element_type(), padded_dimensions); - VLOG(3) << "Generated padded shape: " << padded_shape.ToString(); - return hlo->parent()->AddInstruction(HloInstruction::CreatePad( - padded_shape, input, initial_value, padding_config)); - } - return input; + int64 padded_num_elements = num_fit * num_fit; + PaddingConfig padding_config = MakeNoPaddingConfig(input_shape.rank()); + padding_config.mutable_dimensions(reduced_input_dimension) + ->set_edge_padding_high(padded_num_elements - reduced_dim_size); + std::vector padded_dimensions(input_shape.dimensions().begin(), + input_shape.dimensions().end()); + padded_dimensions[reduced_input_dimension] = padded_num_elements; + Shape padded_shape = + ShapeUtil::MakeShape(input_shape.element_type(), padded_dimensions); + VLOG(3) << "Generated padded shape: " << padded_shape.ToString(); + return hlo->parent()->AddInstruction(HloInstruction::CreatePad( + padded_shape, input, initial_value, padding_config)); }(); VLOG(1) << "Generated padding: " << padded->ToString(); @@ -146,7 +141,7 @@ class ReductionRewriterVisitor : public DfsHloRewriteVisitor { dim_idx++) { if (dim_idx == reduced_input_dimension) { reshaped_dimensions.push_back(num_fit); - reshaped_dimensions.push_back(atomic_free_bound); + reshaped_dimensions.push_back(num_fit); } else { reshaped_dimensions.push_back(padded->shape().dimensions(dim_idx)); } From 13cca52d62148fb5e103c1265c95184b75f577f5 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Wed, 19 Feb 2020 16:38:24 -0800 Subject: [PATCH 298/442] Remove useless self-link and python version note. Fixes #26645 PiperOrigin-RevId: 296086404 Change-Id: Ice56c6032290939e89fd752d584baab3d320c689 --- tensorflow/python/platform/test.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tensorflow/python/platform/test.py b/tensorflow/python/platform/test.py index a2fafed3bed..a8cde30ab16 100644 --- a/tensorflow/python/platform/test.py +++ b/tensorflow/python/platform/test.py @@ -13,13 +13,7 @@ # limitations under the License. # ============================================================================== -"""Testing. - -See the [Testing](https://tensorflow.org/api_docs/python/tf/test) guide. - -Note: `tf.compat.v1.test.mock` is an alias to the python `mock` or -`unittest.mock` depending on the python version. 
-""" +"""Testing.""" from __future__ import absolute_import from __future__ import division From 2fb3d8ba6a8fc6d2ccb01c5764fb6e60f47cb69b Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Wed, 19 Feb 2020 16:48:10 -0800 Subject: [PATCH 299/442] Automated rollback of commit e623eb0f9c1c65705f0cfb1c6cb1d8cb2649cdbb PiperOrigin-RevId: 296088251 Change-Id: If0555f6a1f01eb03fda7dc33377bfdb317843740 --- tensorflow/python/distribute/BUILD | 3 +- .../python/distribute/cross_device_ops.py | 94 ++++++- .../python/distribute/cross_device_utils.py | 229 ++++-------------- 3 files changed, 125 insertions(+), 201 deletions(-) diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index a4e2795ce2e..1ccb21cea17 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -790,7 +790,8 @@ cuda_py_test( name = "cross_device_ops_test", srcs = ["cross_device_ops_test.py"], tags = [ - "multi_and_single_gpu", + # TODO(b/138143527): Re-enable after fixing Guitar failure. + # "multi_and_single_gpu", ], deps = [ ":collective_all_reduce_strategy", diff --git a/tensorflow/python/distribute/cross_device_ops.py b/tensorflow/python/distribute/cross_device_ops.py index 3b5dff9a6f8..7f6230e9404 100644 --- a/tensorflow/python/distribute/cross_device_ops.py +++ b/tensorflow/python/distribute/cross_device_ops.py @@ -34,6 +34,7 @@ from tensorflow.python.framework import kernels from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.platform import tf_logging as logging @@ -1150,7 +1151,7 @@ class CollectiveAllReduce(CrossDeviceOps): reduced_gv_list): control_input_grads = [g for g, _ in reduced_gv_list[-1]] else: - control_input_grads = None + control_input_grads = [] collective_reduced = cross_device_utils.build_collective_reduce( grads, self._num_workers, self._collective_keys, "Add", "Id", communication_hint, control_input_grads) @@ -1199,20 +1200,87 @@ class CollectiveAllReduce(CrossDeviceOps): # optimizer and packed into a single all-reduce. with ops.name_scope("allreduce"): for grad_and_vars in chunk: - grads = [g for g, _ in grad_and_vars] + # `grad_and_vars` contains gradients for the same variable but from + # different devices. Because current CollectiveAllGather + # implementations require input IndexedSlices to have consistent + # length across the board, we handle the reduction of IndexedSlices + # as follows: + # 1. Gather the lengths of IndexedSlices from all participants. + # 2. If they have consistent length, apply all_gather. + # 3. Otherwise convert IndexedSlices to dense tensors and apply + # all_reduce. - # Add control dependencies per device from the last gradients to the - # current set, in order to serialize NCCL launches. - if (communication_hint == CollectiveCommunication.NCCL.value and - reduced_gv_list): - control_input_grads = [g for g, _ in reduced_gv_list[-1]] - else: - control_input_grads = None + def all_gather(): + """Use all_gather to aggregate `IndexedSlices`.""" + grads = [g for g, _ in grad_and_vars] # pylint: disable=cell-var-from-loop + values = [g.values for g in grads] + indices = [g.indices for g in grads] + + # Build two separate allgathers, one for values, the other one for + # indices. 
+ gathered_values = cross_device_utils.build_collective_gather( + values, self._num_workers, self._collective_keys) + gathered_indices = cross_device_utils.build_collective_gather( + indices, self._num_workers, self._collective_keys) + assert len(gathered_values) == len(gathered_indices) + + gathered_grads = [] + for i in range(len(values)): + gathered_grad = ops.IndexedSlices( + values=gathered_values[i], + indices=gathered_indices[i], + dense_shape=grads[i].dense_shape) + gathered_grads.append(gathered_grad) + return gathered_grads + + def all_reduce(): + """Use all_reduce to aggregate `IndexedSlices`.""" + grads = [] + for g, _ in grad_and_vars: # pylint: disable=cell-var-from-loop + with ops.device(g.device): + grads.append(ops.convert_to_tensor(g)) + + reduced_dense_grads = cross_device_utils.build_collective_reduce( + grads, self._num_workers, self._collective_keys, "Add", "Id", + communication_hint) + # We have to convert dense grad to IndexedSlice because all_reduce() + # and all_gather() must have the same return type as required by + # control_flow_ops.cond. + reduced_grads = [] + for grad in reduced_dense_grads: + reduced_grads.append( + ops.IndexedSlices( + values=grad, + indices=math_ops.range(array_ops.shape(grad)[0]), + dense_shape=array_ops.shape(grad))) + return reduced_grads + + indexed_slice_lengths = [] + for g, _ in grad_and_vars: + with ops.device(g.device): + indexed_slice_lengths.append(array_ops.shape(g.indices)) + gathered_indexed_slice_lengths = ( + cross_device_utils.build_collective_gather( + indexed_slice_lengths, self._num_workers, + self._collective_keys)) + # gathered_indexed_slice_lengths takes the following forms: + # [[length1_on_gpu_0, length2_on_gpu0, ...], + # [length1_on_gpu_1, length2_on_gpu1, ...] + # ... + # ] + # Each sublist is value-wise identical but resides on different + # devices. Since each sublist has the same value, we can just use the + # first sublist to compute the condition. + collective_reduced = control_flow_ops.cond( + math_ops.equal( + math_ops.reduce_max(gathered_indexed_slice_lengths[0]), + math_ops.reduce_min(gathered_indexed_slice_lengths[0])), + all_gather, all_reduce) + # tf.cond implicitly unpacks singleton list to single value, hence + # we need to re-wrap the single value into a singleton list here. 
+ if not isinstance(collective_reduced, list): + collective_reduced = [collective_reduced] - collective_reduced = ( - cross_device_utils.build_collective_gather_indexed_slices( - grads, self._num_workers, self._collective_keys, - communication_hint, control_input_grads)) result = [] for (_, v), g in zip(grad_and_vars, collective_reduced): result.append([g, v]) diff --git a/tensorflow/python/distribute/cross_device_utils.py b/tensorflow/python/distribute/cross_device_utils.py index 0b88bdc9067..3afb8b55b24 100644 --- a/tensorflow/python/distribute/cross_device_utils.py +++ b/tensorflow/python/distribute/cross_device_utils.py @@ -25,12 +25,12 @@ from tensorflow.python.distribute import all_reduce from tensorflow.python.distribute import values as value_lib from tensorflow.python.eager import backprop from tensorflow.python.eager import context +from tensorflow.python.eager import def_function from tensorflow.python.framework import device as pydev from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import collective_ops -from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nccl_ops @@ -304,19 +304,6 @@ class CollectiveKeys(object): self._group_key_table[key_id] = new_key return self._group_key_table[key_id] - def get_group_key_of_tensors(self, tensors): - """Returns a group key for set of tensors. - - Args: - tensors: list of `Tensor`s in a collective group. Each tensor must be on a - different device. - - Returns: - int key uniquely identifying the set of devices of these tensors. - """ - devices = [t.device for t in tensors] - return self.get_group_key(devices) - def get_op_instance_key(self): """Returns a new instance key for use in defining a collective op.""" v = self._get_thread_local_object().op_instance_key @@ -335,12 +322,10 @@ def build_collective_reduce(input_tensors, collective_keys, reduction_op='Add', unary_op='Id', - communication_hint='AUTO', + communication_hint='auto', control_inputs=None): """Build a subgraph that does one full all-reduce, using the collective Op. - This method must be called in graph mode or inside a tf.function. - Args: input_tensors: tensors within a single worker graph that are to be reduced together; must be one per device. @@ -361,40 +346,37 @@ def build_collective_reduce(input_tensors, Raises: ValueError: There must be at least two tensors over all the workers. 
""" - assert not context.executing_eagerly(), ( - 'build_collective_reduce can only be called in graph mode or inside ' - 'tf.function') - group_size = len(input_tensors) * num_workers if group_size < 2: return input_tensors - group_key = collective_keys.get_group_key_of_tensors(input_tensors) + devices = [t.device for t in input_tensors] + num_devices = len(devices) + group_key = collective_keys.get_group_key(devices) instance_key = collective_keys.get_op_instance_key() subdiv_offsets = [0] # TODO(tucker): maybe support non-default subdiv spec + if control_inputs: + assert len(control_inputs) == len(input_tensors) out_tensors = [] - for idx, input_tensor in enumerate(input_tensors): - with ops.device(input_tensor.device): - with ops.control_dependencies( - _control_input(input_tensors, control_inputs, idx)): - out_tensor = collective_ops.all_reduce(input_tensor, group_size, - group_key, instance_key, - reduction_op, unary_op, - subdiv_offsets, - communication_hint) - out_tensors.append(out_tensor) + for dev_idx in range(num_devices): + with ops.device(devices[dev_idx]): + if control_inputs: + assert control_inputs[dev_idx].device == input_tensors[dev_idx].device + with ops.control_dependencies([control_inputs[dev_idx]]): + reduce_op = collective_ops.all_reduce( + input_tensors[dev_idx], group_size, group_key, instance_key, + reduction_op, unary_op, subdiv_offsets, communication_hint) + else: + reduce_op = collective_ops.all_reduce( + input_tensors[dev_idx], group_size, group_key, instance_key, + reduction_op, unary_op, subdiv_offsets, communication_hint) + out_tensors.append(reduce_op) return out_tensors -def build_collective_gather(input_tensors, - num_workers, - collective_keys, - communication_hint='AUTO', - control_inputs=None): +def build_collective_gather(input_tensors, num_workers, collective_keys): """Build a subgraph that does one full all-gather, using the collective Op. - This method must be called in graph mode or inside a tf.function. - Args: input_tensors: tensors within a single worker graph that are to be gathered together; must be one per device. @@ -402,136 +384,37 @@ def build_collective_gather(input_tensors, will be doing this same reduction. The reduction will actually include the corresponding tensors at all these workers. collective_keys: a CollectiveKeys object. - communication_hint: string providing hint to runtime for choosing collective - implementation. - control_inputs: if not None, add control edges between control_inputs and - (index-wise) corresponding collective_gather tensors Returns: An array of final tensors, one per device, computed by the full gather. - """ - assert not context.executing_eagerly(), ( - 'build_collective_gather can only be called in graph mode or inside ' - 'tf.function') + Raises: + ValueError: There must be at least two tensors over all the workers. 
+ """ group_size = len(input_tensors) * num_workers if group_size < 2: return input_tensors - group_key = collective_keys.get_group_key_of_tensors(input_tensors) + devices = [t.device for t in input_tensors] + num_devices = len(devices) + group_key = collective_keys.get_group_key(devices) instance_key = collective_keys.get_op_instance_key() - out_tensors = [] - for idx, input_tensor in enumerate(input_tensors): - with ops.device(input_tensor.device): - with ops.control_dependencies( - _control_input(input_tensors, control_inputs, idx)): - out_tensor = collective_ops.all_gather(input_tensor, group_size, - group_key, instance_key, - communication_hint) - out_tensors.append(out_tensor) - return out_tensors + def collective_all_gather(): + """Call collective allgather.""" + assert not context.executing_eagerly() + out_tensors = [] + for d in range(num_devices): + with ops.device(devices[d]): + gather_op = collective_ops.all_gather(input_tensors[d], group_size, + group_key, instance_key) + out_tensors.append(gather_op) + return out_tensors - -def build_collective_gather_indexed_slices(input_slices_list, - num_workers, - collective_keys, - communication_hint='AUTO', - control_inputs=None): - """Build a subgraph that all-gathers IndexedSlices using the collective Op. - - This method must be called in graph mode or inside a tf.function. - - Args: - input_slices_list: a list of IndexedSlices within a single worker graph that - are to be gathered together; must be one per device. - num_workers: total number of workers with identical independent graphs that - will be doing this same reduction. The reduction will actually include - the corresponding tensors at all these workers. - collective_keys: a CollectiveKeys object. - communication_hint: string providing hint to runtime for choosing collective - implementation. - control_inputs: if not None, add control edges between control_inputs and - (index-wise) corresponding collective_reduce tensors - - Returns: - An array of final IndexedSlices, one per device, computed by the full - gather. - - Raises: - ValueError: if control_inputs is not None and doesn't match the length and - devices of inputs. - """ - assert not context.executing_eagerly(), ( - 'build_collective_gather_indexed_slices can only be called in graph mode' - ' or inside tf.function') - - group_size = len(input_slices_list) * num_workers - if group_size < 2: - return input_slices_list - - group_key = collective_keys.get_group_key_of_tensors(input_slices_list) - gather_length_key = collective_keys.get_op_instance_key() - gather_indices_key = collective_keys.get_op_instance_key() - gather_values_key = collective_keys.get_op_instance_key() - reduce_densified_key = collective_keys.get_op_instance_key() - - # Current CollectiveAllGather implementations require input IndexedSlices to - # have consistent length across the board, we handle the reduction of - # IndexedSlices as follows: - # 1. Gather the lengths of IndexedSlices from all participants. - # 2. If they have consistent length, apply all_gather. - # 3. Otherwise convert IndexedSlices to dense tensors and apply - # all_reduce. - out_slices_list = [] - for idx, input_slices in enumerate(input_slices_list): - # pylint: disable = cell-var-from-loop - with ops.device(input_slices.device): - - def all_gather(): - """Use all_gather to aggregate `IndexedSlices`.""" - all_values = collective_ops.all_gather(input_slices.values, group_size, - group_key, gather_values_key, - communication_hint) - # Add control dependency to order the all-gather. 
- control = [all_values] if communication_hint == 'NCCL' else [] - with ops.control_dependencies(control): - all_indices = collective_ops.all_gather(input_slices.indices, - group_size, group_key, - gather_indices_key, - communication_hint) - return ops.IndexedSlices( - values=all_values, - indices=all_indices, - dense_shape=input_slices.dense_shape) - - def densify_and_all_reduce(): - """Use all_reduce to aggregate `IndexedSlices`.""" - densified = ops.convert_to_tensor(input_slices) - reduced = collective_ops.all_reduce(densified, group_size, group_key, - reduce_densified_key, 'Add', 'Id', - [0], communication_hint) - # We have to convert dense grad to IndexedSlice because all_reduce() - # and all_gather() must have the same return type as required by - # control_flow_ops.cond. - return ops.IndexedSlices( - values=reduced, - indices=math_ops.range(array_ops.shape(reduced)[0]), - dense_shape=input_slices.dense_shape) - - length = array_ops.shape(input_slices.indices) - with ops.control_dependencies( - _control_input(input_slices, control_inputs, idx)): - all_lengths = collective_ops.all_gather(length, group_size, group_key, - gather_length_key, - communication_hint) - out_slices = control_flow_ops.cond( - math_ops.equal( - math_ops.reduce_max(all_lengths), - math_ops.reduce_min(all_lengths)), all_gather, - densify_and_all_reduce) - out_slices_list.append(out_slices) - # pylint: enable=cell-var-from-loop - return out_slices_list + if context.executing_eagerly(): + # Collective ops will block unless they are executed concurrently such as in + # a graph or a defun. + collective_all_gather = def_function.function(collective_all_gather) + return collective_all_gather() def sum_grad_and_var_all_reduce(grad_and_vars, @@ -894,31 +777,3 @@ def stitch_values(values_and_indices_list): assert result[i] is None result[i] = v return result - - -def _control_input(inputs, control_inputs, idx): - """Returns the `idx`-th item in control_inputs to be used in ops.control_dependencies. - - This is a helper function for building collective ops. The function checks - that the devices of control_inputs and inputs match. - - Args: - inputs: a list of `Tensor`s - control_inputs: a list or None. - idx: the index into `inputs` and `control_inputs`. - - Returns: - A one item list of the `idx`-th element of `control_inputs`, or an empty - list if `control_inputs` is None. 
- """ - if control_inputs is None: - return [] - if len(control_inputs) != len(inputs): - raise ValueError( - 'control_inputs must match the length of the inputs, %s != %s' % - (len(control_inputs), len(inputs))) - if control_inputs[idx].device != inputs[idx].device: - raise ValueError( - 'control_inputs must match the device of the inputs, %s != %s' % - (control_inputs[idx].device, inputs[idx].device)) - return control_inputs[idx] From 513c39fc74b6d17fa9cef3a79749636ac5df7516 Mon Sep 17 00:00:00 2001 From: Ran Chen Date: Wed, 19 Feb 2020 16:59:23 -0800 Subject: [PATCH 300/442] Change to use assertLen in one test due to pylint complaints PiperOrigin-RevId: 296090392 Change-Id: I4faf420c108b011154b3f4a7c3eee68482ef9de4 --- tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py index b345cb99b5d..2b74c3fa12f 100644 --- a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py +++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py @@ -242,7 +242,7 @@ class OptimizerTest(test.TestCase): sgd = gradient_descent.SGD(3.0) grads_and_vars = sgd._compute_gradients(f, [x]) - self.assertEqual(1, len(grads_and_vars)) + self.assertLen(grads_and_vars, 1) grad, x_as_var = grads_and_vars[0] self.assertIs(x, x_as_var) self.assertEqual(2.0, self.evaluate(grad)) From abaab5b360a042f9111f57bfb58de496dee3b88c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 17:01:57 -0800 Subject: [PATCH 301/442] Fix a bug in ctc_loss_dense with unique. 1. Unique label has to consider the case where blank label is not 0. 2. The scattering mechanism assumes that 0.0 always corresponds to a padding region, but this is not the case when there is only single valid path (0.0 = log(1.0)). This happen when the lengths of the logits and the label are the same. 
PiperOrigin-RevId: 296090785 Change-Id: I803508252e688571bca531b1aa95dd2160902d4c --- .../python/kernel_tests/ctc_loss_op_test.py | 98 +++++++++++++++++++ tensorflow/python/ops/ctc_ops.py | 21 +++- 2 files changed, 117 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/kernel_tests/ctc_loss_op_test.py b/tensorflow/python/kernel_tests/ctc_loss_op_test.py index e7f1f8a5e85..19918496fbd 100644 --- a/tensorflow/python/kernel_tests/ctc_loss_op_test.py +++ b/tensorflow/python/kernel_tests/ctc_loss_op_test.py @@ -460,6 +460,69 @@ class CTCLossTestV2(test.TestCase): time_major=True) tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss, [logits])[0] + with self.cached_session(): + for _ in range(32): + self.assertAllClose(*self.evaluate([ctc_loss, tf_nn_ctc_loss])) + self.assertAllClose( + *self.evaluate([ctc_loss_grads, tf_nn_ctc_grads]), + rtol=2e-06, + atol=2e-06) + + @test_util.run_v1_only("b/120545219") + def testCtcLossDenseUniqueFastPathWithBlankIndexIsSameAsCtcLoss(self): + random_seed.set_random_seed(5) + + batch_size = 8 + num_labels = 6 + label_length = 5 + num_frames = 12 + logits = random_ops.random_uniform([num_frames, batch_size, num_labels]) + labels = random_ops.random_uniform([batch_size, label_length], + minval=0, + maxval=num_labels - 1, + dtype=dtypes.int64) + + label_lengths = random_ops.random_uniform([batch_size], + minval=2, + maxval=label_length, + dtype=dtypes.int64) + label_mask = array_ops.sequence_mask( + label_lengths, maxlen=label_length, dtype=label_lengths.dtype) + labels *= label_mask + + logit_lengths = [num_frames] * batch_size + + tf_ctc_loss_labels = math_ops.cast(labels, dtypes.int32) + tf_ctc_loss_labels = ctc_ops.dense_labels_to_sparse(tf_ctc_loss_labels, + label_lengths) + + tf_nn_ctc_loss = ctc_ops.ctc_loss( + labels=tf_ctc_loss_labels, + inputs=logits, + sequence_length=logit_lengths, + time_major=True) + tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss, [logits])[0] + + # Shift the blank logits/labels to be somewhere in the middle. + blank_index = 2 + shifted_logits = array_ops.concat([ + logits[:, :, :blank_index], + logits[:, :, -1:], + logits[:, :, blank_index:-1], + ], + axis=2) + shifted_labels = array_ops.where_v2(labels < blank_index, labels, + labels + 1) + + ctc_loss = ctc_ops.ctc_loss_dense( + labels=shifted_labels, + logits=shifted_logits, + label_length=label_lengths, + logit_length=logit_lengths, + blank_index=blank_index, + unique=ctc_ops.ctc_unique_labels(shifted_labels)) + ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0] + with self.cached_session() as sess: for _ in range(32): self.assertAllClose(*self.evaluate([ctc_loss, tf_nn_ctc_loss])) @@ -773,6 +836,41 @@ class CTCLossTestV2(test.TestCase): [22.0 + 23.0 + 24.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], ]) + def testStateToOlabelUniqueSinglePath(self): + labels = [ + [3, 4, 3], + [1, 0, 0], + ] + num_labels = 8 + + # 3 frames, 2 batch, 8 states (4 label, 4 blank). + # + # There is only single valid path for each sequence because the frame + # lengths and the label lengths are the same. 
+ states = [[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], + [[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], + [[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]] + labels = ops.convert_to_tensor(labels) + states = math_ops.log(states) + olabel = ctc_ops._state_to_olabel_unique(labels, num_labels, states, + ctc_ops.ctc_unique_labels(labels)) + olabel = math_ops.exp(olabel) + blank = olabel[:, :, 0] + + self.assertAllClose(blank, [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]) + self.assertAllClose(olabel[:, :, 1:], + [ + [[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0], + [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], + [[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], + [[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], + ]) + @test_util.run_deprecated_v1 def testScan(self): with ops.device("/GPU:0" if test.is_gpu_available() else "/CPU:0"): diff --git a/tensorflow/python/ops/ctc_ops.py b/tensorflow/python/ops/ctc_ops.py index 4b3a5dd7fe9..d18799c5224 100644 --- a/tensorflow/python/ops/ctc_ops.py +++ b/tensorflow/python/ops/ctc_ops.py @@ -601,9 +601,18 @@ def _state_to_olabel_unique(labels, num_labels, states, unique): updates=batch_state_major, shape=[batch_size * num_labels, num_frames]) scatter = array_ops.reshape(scatter, [batch_size, num_labels, num_frames]) + + mask = array_ops.ones_like(batch_state_major, dtype=dtypes.bool) + mask = array_ops.scatter_nd( + indices=indices, + updates=mask, + shape=[batch_size * num_labels, num_frames]) + mask = array_ops.reshape(mask, [batch_size, num_labels, num_frames]) + scatter = array_ops.where( - math_ops.equal(scatter, 0.0), - array_ops.fill(array_ops.shape(scatter), math_ops.log(0.0)), scatter) + mask, scatter, + array_ops.fill(array_ops.shape(scatter), math_ops.log(0.0))) + label_olabels = array_ops.transpose(scatter, [2, 0, 1]) label_olabels = label_olabels[:, :, 1:] @@ -1010,6 +1019,14 @@ def ctc_loss_dense(labels, if unique: unique_y, unique_idx = unique + if blank_index != 0: + unique_y = array_ops.where(unique_y < blank_index, unique_y + 1, + unique_y) + label_mask_len = math_ops.reduce_max(unique_idx, axis=1) + 1 + max_label_length = _get_dim(unique_y, 1) + label_mask = array_ops.sequence_mask(label_mask_len, max_label_length) + unique_y = array_ops.where(label_mask, unique_y, + array_ops.zeros_like(unique_y)) args.extend([unique_y, unique_idx]) @custom_gradient.custom_gradient From 10666c59dd4858645d1b03ce01f4450da80710ec Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Wed, 19 Feb 2020 17:02:39 -0800 Subject: [PATCH 302/442] Keras ideal fit and compile. Kept all new abstractions private for now. In a few weeks, if we're comfortable that these abstractions are working and stable, we should expose many of them publicly. Capabilities added by this CL: (1) Easy to create a custom training step via overriding Model._train_step (2) Easy to create custom tf.function / DistStrat logic via overriding Model._make_train_function (3) Advanced users can override Model.compile and Model.fit (4) Full support for dicts, nested structures, etc. with Subclassed Models. (5) "Power user" path (tf.data inputs) only modifies data in Model._train_step, where this behavior is easy to override and disable. This applies even to Keras's assumption that data is passed in (x, y, sample_weight) format. Behavior changes: (1) "loss" passed to Callbacks is now stateful (like all other metrics in Callbacks).
This greatly simplifies the training step logic and callback logic. (2) ProgbarLogger always uses steps. If steps is not available, the ProgbarLogger handles inferring the steps after the first epoch. (3) validation_batch_size added in `fit`, rather than inferring from generator. (4) Model.inputs, Model.outputs, Model.input_names, and Model.output_names are no longer populated for subclassed Models. Instead, "pseudo" output names are created for subclassed Models, which are only used for metrics names and SavedModel's signature. (5) Cast NumPy floats to backend.floatx(), otherwise leave unchanged (this is likely not a change, we did something like this in our old version but the logic was scattered in many places) PiperOrigin-RevId: 296090972 Change-Id: Ia5ac833fd39085bddb016833bd338083d0dc5fc2 --- .../debug/lib/distributed_callbacks_test.py | 4 +- .../python/distribute/keras_save_load_test.py | 8 +- .../model_collection/simple_models.py | 6 +- .../distribute/saved_model_mixed_api_test.py | 8 +- .../distribute/saved_model_save_load_test.py | 16 +- .../distribute/saved_model_test_base.py | 18 +- tensorflow/python/eager/forwardprop.py | 4 +- tensorflow/python/eager/forwardprop_test.py | 2 +- tensorflow/python/eager/function.py | 3 +- tensorflow/python/keras/backend.py | 4 + tensorflow/python/keras/callbacks.py | 246 +- tensorflow/python/keras/callbacks_test.py | 107 +- .../distribute/distribute_strategy_test.py | 28 +- .../keras/distribute/keras_utils_test.py | 70 +- tensorflow/python/keras/engine/BUILD | 20 - tensorflow/python/keras/engine/base_layer.py | 51 +- .../python/keras/engine/base_layer_test.py | 44 +- .../python/keras/engine/compile_utils.py | 269 +- .../python/keras/engine/compile_utils_test.py | 65 +- .../python/keras/engine/data_adapter.py | 448 ++- .../python/keras/engine/data_adapter_test.py | 59 +- tensorflow/python/keras/engine/network.py | 68 +- tensorflow/python/keras/engine/sequential.py | 18 +- .../python/keras/engine/sequential_test.py | 64 +- tensorflow/python/keras/engine/training.py | 2712 ++++------------- .../python/keras/engine/training_arrays.py | 18 +- .../keras/engine/training_dataset_test.py | 43 +- .../keras/engine/training_eager_test.py | 9 +- .../python/keras/engine/training_generator.py | 18 +- .../keras/engine/training_generator_test.py | 38 +- .../python/keras/engine/training_test.py | 999 +----- tensorflow/python/keras/engine/training_v1.py | 69 +- tensorflow/python/keras/engine/training_v2.py | 778 ----- .../python/keras/engine/training_v2_utils.py | 556 ---- .../keras/engine/training_v2_utils_test.py | 160 - tensorflow/python/keras/layers/core.py | 19 +- tensorflow/python/keras/layers/merge.py | 20 +- .../python/keras/layers/normalization_test.py | 4 +- .../preprocessing/normalization_test.py | 32 +- .../python/keras/layers/wrappers_test.py | 47 +- tensorflow/python/keras/losses.py | 14 +- tensorflow/python/keras/metrics.py | 13 +- .../python/keras/metrics_correctness_test.py | 99 +- tensorflow/python/keras/models.py | 50 +- tensorflow/python/keras/models_test.py | 8 +- tensorflow/python/keras/premade/linear.py | 2 +- tensorflow/python/keras/premade/wide_deep.py | 56 +- .../python/keras/premade/wide_deep_test.py | 2 - .../python/keras/saving/hdf5_format_test.py | 26 +- .../keras/saving/losses_serialization_test.py | 16 +- .../saving/metrics_serialization_test.py | 11 - .../python/keras/saving/saved_model/load.py | 7 +- .../keras/saving/saved_model/revive_test.py | 26 +- .../keras/saving/saved_model/save_impl.py | 29 +- 
.../saving/saved_model/saved_model_test.py | 34 +- .../saving/saved_model_experimental_test.py | 21 +- .../python/keras/saving/saving_utils.py | 216 +- .../python/keras/saving/saving_utils_test.py | 58 +- tensorflow/python/keras/testing_utils.py | 3 + .../tests/model_subclassing_compiled_test.py | 2 - .../keras/tests/model_subclassing_test.py | 7 +- ...emporal_sample_weights_correctness_test.py | 45 +- .../utils/composite_tensor_support_test.py | 113 +- .../python/keras/utils/generic_utils.py | 34 +- tensorflow/python/keras/utils/layer_utils.py | 1 - tensorflow/python/keras/utils/tf_utils.py | 25 + .../python/keras/utils/tf_utils_test.py | 2 + tensorflow/python/layers/base.py | 2 +- .../golden/v1/tensorflow.keras.-model.pbtxt | 8 +- .../v1/tensorflow.keras.-sequential.pbtxt | 8 +- ...low.keras.experimental.-linear-model.pbtxt | 8 +- ....keras.experimental.-wide-deep-model.pbtxt | 8 +- .../v1/tensorflow.keras.models.-model.pbtxt | 8 +- .../tensorflow.keras.models.-sequential.pbtxt | 8 +- .../v1/tensorflow.keras.utils.-progbar.pbtxt | 2 +- .../golden/v2/tensorflow.keras.-model.pbtxt | 8 +- .../v2/tensorflow.keras.-sequential.pbtxt | 8 +- ...low.keras.experimental.-linear-model.pbtxt | 8 +- ....keras.experimental.-wide-deep-model.pbtxt | 8 +- .../v2/tensorflow.keras.models.-model.pbtxt | 8 +- .../tensorflow.keras.models.-sequential.pbtxt | 8 +- .../v2/tensorflow.keras.utils.-progbar.pbtxt | 2 +- 82 files changed, 2215 insertions(+), 5959 deletions(-) delete mode 100644 tensorflow/python/keras/engine/training_v2.py delete mode 100644 tensorflow/python/keras/engine/training_v2_utils.py delete mode 100644 tensorflow/python/keras/engine/training_v2_utils_test.py diff --git a/tensorflow/python/debug/lib/distributed_callbacks_test.py b/tensorflow/python/debug/lib/distributed_callbacks_test.py index 4b1eb3e498a..606f14b3230 100644 --- a/tensorflow/python/debug/lib/distributed_callbacks_test.py +++ b/tensorflow/python/debug/lib/distributed_callbacks_test.py @@ -195,6 +195,7 @@ class DistributedDumpingCallbackTest( self.assertAllClose(device_1_matmul_values[0], [[10.0]]) self.assertAllClose(device_1_bias_add_values[0], [[11.0]]) + # TODO(b/148461691): Fix for new Keras internals. 
@combinations.generate( combinations.combine( distribution=[ @@ -206,7 +207,8 @@ class DistributedDumpingCallbackTest( mode=["eager"], tensor_debug_mode=["NO_TENSOR", "FULL_TENSOR"], )) - def testKerasModelFitOnOneOrTwoDevices(self, distribution, tensor_debug_mode): + def DISABLED_testKerasModelFitOnOneOrTwoDevices(self, distribution, + tensor_debug_mode): writer = dumping_callback.enable_dump_debug_info( self.dump_root, tensor_debug_mode=tensor_debug_mode) diff --git a/tensorflow/python/distribute/keras_save_load_test.py b/tensorflow/python/distribute/keras_save_load_test.py index 494a348d050..6475406eb4b 100644 --- a/tensorflow/python/distribute/keras_save_load_test.py +++ b/tensorflow/python/distribute/keras_save_load_test.py @@ -33,8 +33,12 @@ class KerasSaveLoadTest(test_base.TestSavedModelBase): def _save_model(self, model, saved_dir): model.save(saved_dir, save_format='tf') - def _load_and_run_model(self, distribution, saved_dir, predict_dataset, - output_name, experimental_run_tf_function): + def _load_and_run_model(self, + distribution, + saved_dir, + predict_dataset, + experimental_run_tf_function, + output_name='output_1'): restored_keras_model = save.load_model(saved_dir) restored_keras_model._experimental_run_tf_function = ( experimental_run_tf_function) diff --git a/tensorflow/python/distribute/model_collection/simple_models.py b/tensorflow/python/distribute/model_collection/simple_models.py index 63a2bfcb520..ededb0a7f59 100644 --- a/tensorflow/python/distribute/model_collection/simple_models.py +++ b/tensorflow/python/distribute/model_collection/simple_models.py @@ -45,7 +45,7 @@ class SimpleFunctionalModel(model_collection_base.ModelAndInput): """A simple functional model and its inputs.""" def get_model(self, **kwargs): - output_name = 'output_layer' + output_name = 'output_1' x = keras.layers.Input(shape=(3,), dtype=dtypes.float32) y = keras.layers.Dense(5, dtype=dtypes.float32, name=output_name)(x) @@ -74,7 +74,7 @@ class SimpleSequentialModel(model_collection_base.ModelAndInput): """A simple sequential model and its inputs.""" def get_model(self, **kwargs): - output_name = 'output_layer' + output_name = 'output_1' model = keras.Sequential() y = keras.layers.Dense( @@ -106,7 +106,7 @@ class _SimpleModel(keras.Model): self._dense_layer = keras.layers.Dense(5, dtype=dtypes.float32) def call(self, inputs): - return {'output_layer': self._dense_layer(inputs)} + return self._dense_layer(inputs) class SimpleSubclassModel(model_collection_base.ModelAndInput): diff --git a/tensorflow/python/distribute/saved_model_mixed_api_test.py b/tensorflow/python/distribute/saved_model_mixed_api_test.py index 2b0e5e9e899..240f5f45f9f 100644 --- a/tensorflow/python/distribute/saved_model_mixed_api_test.py +++ b/tensorflow/python/distribute/saved_model_mixed_api_test.py @@ -41,8 +41,12 @@ class SavedModelSaveAndLoadTest(test_base.TestSavedModelBase): def _save_model(self, model, saved_dir): keras_saved_model.export_saved_model(model, saved_dir, serving_only=True) - def _load_and_run_model(self, distribution, saved_dir, predict_dataset, - output_name, experimental_run_tf_function): + def _load_and_run_model(self, + distribution, + saved_dir, + predict_dataset, + experimental_run_tf_function, + output_name='output_1'): return test_base.load_and_run_with_saved_model_api(distribution, saved_dir, predict_dataset, output_name) diff --git a/tensorflow/python/distribute/saved_model_save_load_test.py b/tensorflow/python/distribute/saved_model_save_load_test.py index 5380d6f9d1f..10dae8065bb 100644 --- 
a/tensorflow/python/distribute/saved_model_save_load_test.py +++ b/tensorflow/python/distribute/saved_model_save_load_test.py @@ -35,8 +35,12 @@ class SavedModelKerasModelTest(test_base.TestSavedModelBase): def _save_model(self, model, saved_dir): saved_model.save(model, saved_dir) - def _load_and_run_model(self, distribution, saved_dir, predict_dataset, - output_name, experimental_run_tf_function): + def _load_and_run_model(self, + distribution, + saved_dir, + predict_dataset, + experimental_run_tf_function, + output_name='output_1'): return test_base.load_and_run_with_saved_model_api(distribution, saved_dir, predict_dataset, output_name) @@ -100,8 +104,12 @@ class SavedModelTFModuleTest(test_base.TestSavedModelBase): call = model.__call__.get_concrete_function(tensor_spec.TensorSpec(None)) saved_model.save(model, saved_dir, signatures=call) - def _load_and_run_model(self, distribution, saved_dir, predict_dataset, - output_name, experimental_run_tf_function): + def _load_and_run_model(self, + distribution, + saved_dir, + predict_dataset, + experimental_run_tf_function, + output_name='output_1'): del output_name, experimental_run_tf_function model = saved_model.load(saved_dir) return self._predict_with_model(distribution, model, predict_dataset) diff --git a/tensorflow/python/distribute/saved_model_test_base.py b/tensorflow/python/distribute/saved_model_test_base.py index 832bb4f1dbd..5d3511c6cde 100644 --- a/tensorflow/python/distribute/saved_model_test_base.py +++ b/tensorflow/python/distribute/saved_model_test_base.py @@ -150,8 +150,12 @@ class TestSavedModelBase(test.TestCase, parameterized.TestCase): """ raise NotImplementedError('must be implemented in descendants') - def _load_and_run_model(self, distribution, saved_dir, predict_dataset, - output_name, experimental_run_tf_function): + def _load_and_run_model(self, + distribution, + saved_dir, + predict_dataset, + experimental_run_tf_function, + output_name='output_1'): """Load the model and run 1 step of predict with it. This method must be implemented by the subclasses. @@ -162,10 +166,10 @@ class TestSavedModelBase(test.TestCase, parameterized.TestCase): saved_dir: the string representing the path where the model is saved. predict_dataset: the data used to do the predict on the model for cross_replica context. - output_name: the string representing the name of the output layer of the - model. experimental_run_tf_function: Whether to use the single execution path for models. + output_name: the string representing the name of the output layer of the + model. """ raise NotImplementedError('must be implemented in descendants') @@ -211,10 +215,6 @@ class TestSavedModelBase(test.TestCase, parameterized.TestCase): distribution=distribution, saved_dir=saved_dir, predict_dataset=predict_dataset, - # Note that subclassed model's output names aren't defined until after - # the model is built (in these tests, this occurs when the model is - # trained). 
- output_name=getattr(model, 'output_names', [None])[0], experimental_run_tf_function=experimental_run_tf_function) tolerance = get_tolerance(None, distribution) @@ -248,7 +248,6 @@ class TestSavedModelBase(test.TestCase, parameterized.TestCase): distribution=None, saved_dir=saved_dir, predict_dataset=predict_dataset, - output_name=getattr(model, 'output_names', [None])[0], experimental_run_tf_function=experimental_run_tf_function) tolerance = get_tolerance(distribution, None) @@ -285,7 +284,6 @@ class TestSavedModelBase(test.TestCase, parameterized.TestCase): distribution=distribution_for_restoring, saved_dir=saved_dir, predict_dataset=predict_dataset, - output_name=getattr(model, 'output_names', [None])[0], experimental_run_tf_function=experimental_run_tf_function) tolerance = get_tolerance(distribution_for_saving, diff --git a/tensorflow/python/eager/forwardprop.py b/tensorflow/python/eager/forwardprop.py index 973e130ef0f..0bb1e89e4a3 100644 --- a/tensorflow/python/eager/forwardprop.py +++ b/tensorflow/python/eager/forwardprop.py @@ -186,7 +186,7 @@ class ForwardAccumulator(object): >>> x = tf.constant([[2.0, 3.0], [1.0, 4.0]]) >>> dense = tf.keras.layers.Dense(1) - >>> dense.build([2]) + >>> dense.build([None, 2]) >>> with tf.autodiff.ForwardAccumulator( ... primals=dense.kernel, ... tangents=tf.constant([[1.], [0.]])) as acc: @@ -210,7 +210,7 @@ class ForwardAccumulator(object): >>> x = tf.constant([[2.0, 3.0], [1.0, 4.0]]) >>> dense = tf.keras.layers.Dense(1) - >>> dense.build([2]) + >>> dense.build([None, 2]) >>> loss_fn = lambda: tf.reduce_sum((dense(x) - tf.constant([1., -1.])) ** 2.) >>> kernel_fprop = [] >>> with tf.autodiff.ForwardAccumulator( diff --git a/tensorflow/python/eager/forwardprop_test.py b/tensorflow/python/eager/forwardprop_test.py index 79c0714c720..fed04aec270 100644 --- a/tensorflow/python/eager/forwardprop_test.py +++ b/tensorflow/python/eager/forwardprop_test.py @@ -1067,7 +1067,7 @@ class HessianTests(test.TestCase, parameterized.TestCase): ("MapFn", False)]) def testHessianOfVariables(self, use_pfor): model = core.Dense(1) - model.build([2]) + model.build([None, 2]) def _loss(*unused_args): input_value = constant_op.constant([[-0.5, 1.], [0.5, -1.]]) diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py index 76e036da74e..895a5de7765 100644 --- a/tensorflow/python/eager/function.py +++ b/tensorflow/python/eager/function.py @@ -2271,7 +2271,8 @@ def _convert_inputs_to_signature(inputs, input_signature, flat_input_signature): flatten_inputs = nest.flatten_up_to( input_signature, inputs[:len(input_signature)], - expand_composites=True) + expand_composites=True, + check_types=False) # lists are convert to tuples for `tf.data`. except ValueError: raise ValueError("Structure of Python function inputs does not match " "input_signature:\n%s" % diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py index 81323613231..50856e1f173 100644 --- a/tensorflow/python/keras/backend.py +++ b/tensorflow/python/keras/backend.py @@ -4347,6 +4347,10 @@ def in_train_phase(x, alt, training=None): Either `x` or `alt` based on the `training` flag. the `training` flag defaults to `K.learning_phase()`. 
""" + from tensorflow.python.keras.engine import base_layer_utils # pylint: disable=g-import-not-at-top + if training is None: + training = base_layer_utils.call_context().training + if training is None: training = learning_phase() diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py index 6fd3e0e902d..5fae5eb9218 100644 --- a/tensorflow/python/keras/callbacks.py +++ b/tensorflow/python/keras/callbacks.py @@ -49,6 +49,7 @@ from tensorflow.python.ops import summary_ops_v2 from tensorflow.python.ops import variables from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import checkpoint_management +from tensorflow.python.util import nest from tensorflow.python.util.compat import collections_abc from tensorflow.python.util.tf_export import keras_export from tensorflow.tools.docs import doc_controls @@ -187,26 +188,67 @@ def make_logs(model, logs, outputs, mode, prefix=''): class CallbackList(object): - """Container abstracting a list of callbacks. + """Container abstracting a list of callbacks.""" - Arguments: + def __init__(self, + callbacks=None, + add_history=False, + add_progbar=False, + model=None, + **params): + """Creates a container for `Callbacks`. + + Arguments: callbacks: List of `Callback` instances. - queue_length: Queue length for keeping - running statistics over callback execution time. - """ + add_history: Whether a `History` callback should be added, if one does not + already exist in `callback`s. + add_progbar: Whether a `ProgbarLogger` callback should be added, if one + does not already exist in `callback`s. + model: The `Model` these `Callback`s are used with.` + **params: If provided, parameters will be passed to each `Callback` via + `Callback.set_params`. + """ + self.callbacks = nest.flatten(callbacks) if callbacks else [] + self._add_default_callbacks(add_history, add_progbar) - def __init__(self, callbacks=None, queue_length=10): - callbacks = callbacks or [] - self.callbacks = [c for c in callbacks] - self.queue_length = queue_length - self.params = {} - self.model = None + if model: + self.set_model(model) + if params: + self.set_params(params) + + self._queue_length = 10 self._reset_batch_timing() + def _add_default_callbacks(self, add_history, add_progbar): + """Adds `Callback`s that are always present.""" + self._progbar = None + self._history = None + + for cb in self.callbacks: + if isinstance(cb, ProgbarLogger): + self._progbar = cb + elif isinstance(cb, History): + self._history = cb + + if self._progbar is None and add_progbar: + self._progbar = ProgbarLogger(count_mode='steps') + self.callbacks.append(self._progbar) + + if self._history is None and add_history: + self._history = History() + self.callbacks.append(self._history) + def _reset_batch_timing(self): self._delta_t_batch = 0. 
self._delta_ts = collections.defaultdict( - lambda: collections.deque([], maxlen=self.queue_length)) + lambda: collections.deque([], maxlen=self._queue_length)) + + def _process_logs(self, logs): + if logs: + return { + k: v.numpy() if hasattr(v, 'numpy') else v for k, v in logs.items() + } + return {} def append(self, callback): self.callbacks.append(callback) @@ -218,6 +260,8 @@ class CallbackList(object): def set_model(self, model): self.model = model + if self._history: + model.history = self._history for callback in self.callbacks: callback.set_model(model) @@ -266,9 +310,11 @@ class CallbackList(object): self.on_predict_end() def on_batch_begin(self, batch, logs=None): + logs = self._process_logs(logs) self._call_batch_hook(ModeKeys.TRAIN, 'begin', batch, logs=logs) def on_batch_end(self, batch, logs=None): + logs = self._process_logs(logs) self._call_batch_hook(ModeKeys.TRAIN, 'end', batch, logs=logs) def on_epoch_begin(self, epoch, logs=None): @@ -281,7 +327,7 @@ class CallbackList(object): logs: dict. Currently no data is passed to this argument for this method but that may change in the future. """ - logs = logs or {} + logs = self._process_logs(logs) for callback in self.callbacks: callback.on_epoch_begin(epoch, logs) self._reset_batch_timing() @@ -297,7 +343,7 @@ class CallbackList(object): validation epoch if validation is performed. Validation result keys are prefixed with `val_`. """ - logs = logs or {} + logs = self._process_logs(logs) for callback in self.callbacks: callback.on_epoch_end(epoch, logs) @@ -309,6 +355,7 @@ class CallbackList(object): logs: dict. Has keys `batch` and `size` representing the current batch number and the size of the batch. """ + logs = self._process_logs(logs) self._call_batch_hook(ModeKeys.TRAIN, 'begin', batch, logs=logs) def on_train_batch_end(self, batch, logs=None): @@ -318,6 +365,7 @@ class CallbackList(object): batch: integer, index of batch within the current epoch. logs: dict. Metric results for this batch. """ + logs = self._process_logs(logs) self._call_batch_hook(ModeKeys.TRAIN, 'end', batch, logs=logs) def on_test_batch_begin(self, batch, logs=None): @@ -328,6 +376,7 @@ class CallbackList(object): logs: dict. Has keys `batch` and `size` representing the current batch number and the size of the batch. """ + logs = self._process_logs(logs) self._call_batch_hook(ModeKeys.TEST, 'begin', batch, logs=logs) def on_test_batch_end(self, batch, logs=None): @@ -347,6 +396,7 @@ class CallbackList(object): logs: dict. Has keys `batch` and `size` representing the current batch number and the size of the batch. """ + logs = self._process_logs(logs) self._call_batch_hook(ModeKeys.PREDICT, 'begin', batch, logs=logs) def on_predict_batch_end(self, batch, logs=None): @@ -356,6 +406,7 @@ class CallbackList(object): batch: integer, index of batch within the current epoch. logs: dict. Metric results for this batch. """ + logs = self._process_logs(logs) self._call_batch_hook(ModeKeys.PREDICT, 'end', batch, logs=logs) def on_train_begin(self, logs=None): @@ -365,6 +416,7 @@ class CallbackList(object): logs: dict. Currently no data is passed to this argument for this method but that may change in the future. """ + logs = self._process_logs(logs) for callback in self.callbacks: callback.on_train_begin(logs) @@ -375,6 +427,7 @@ class CallbackList(object): logs: dict. Currently no data is passed to this argument for this method but that may change in the future. 
""" + logs = self._process_logs(logs) for callback in self.callbacks: callback.on_train_end(logs) @@ -385,6 +438,7 @@ class CallbackList(object): logs: dict. Currently no data is passed to this argument for this method but that may change in the future. """ + logs = self._process_logs(logs) for callback in self.callbacks: callback.on_test_begin(logs) @@ -395,6 +449,7 @@ class CallbackList(object): logs: dict. Currently no data is passed to this argument for this method but that may change in the future. """ + logs = self._process_logs(logs) for callback in self.callbacks: callback.on_test_end(logs) @@ -405,6 +460,7 @@ class CallbackList(object): logs: dict. Currently no data is passed to this argument for this method but that may change in the future. """ + logs = self._process_logs(logs) for callback in self.callbacks: callback.on_predict_begin(logs) @@ -415,6 +471,7 @@ class CallbackList(object): logs: dict. Currently no data is passed to this argument for this method but that may change in the future. """ + logs = self._process_logs(logs) for callback in self.callbacks: callback.on_predict_end(logs) @@ -721,6 +778,7 @@ class ProgbarLogger(Callback): should *not* be averaged over an epoch. Metrics in this list will be logged as-is. All others will be averaged over time (e.g. loss, etc). + If not provided, defaults to the `Model`'s metrics. Raises: ValueError: In case of invalid `count_mode`. @@ -734,59 +792,96 @@ class ProgbarLogger(Callback): self.use_steps = True else: raise ValueError('Unknown `count_mode`: ' + str(count_mode)) - self.stateful_metrics = set(stateful_metrics or []) - self.log_values = None + # Defaults to all Model's metrics except for loss. + self.stateful_metrics = set(stateful_metrics) if stateful_metrics else None + + self.seen = 0 + self.progbar = None + self.target = None + self.verbose = 1 + self.epochs = 1 + + self._called_in_fit = False + + def set_params(self, params): + self.verbose = params['verbose'] + self.epochs = params['epochs'] + if self.use_steps and 'steps' in params: + self.target = params['steps'] + elif not self.use_steps and 'samples' in params: + self.target = params['samples'] + else: + self.target = None # Will be inferred at the end of the first epoch. def on_train_begin(self, logs=None): - self.verbose = self.params['verbose'] - self.epochs = self.params['epochs'] + # When this logger is called inside `fit`, validation is silent. 
+ self._called_in_fit = True + + def on_test_begin(self, logs=None): + if not self._called_in_fit: + self._reset_progbar() + + def on_predict_begin(self, logs=None): + self._reset_progbar() def on_epoch_begin(self, epoch, logs=None): - self.seen = 0 - if self.use_steps: - self.target = self.params['steps'] - else: - self.target = self.params['samples'] + self._reset_progbar() + if self.verbose and self.epochs > 1: + print('Epoch %d/%d' % (epoch + 1, self.epochs)) - if self.verbose: - if self.epochs > 1: - print('Epoch %d/%d' % (epoch + 1, self.epochs)) - self.progbar = Progbar( - target=self.target, - verbose=self.verbose, - stateful_metrics=self.stateful_metrics, - unit_name='step' if self.use_steps else 'sample') + def on_train_batch_end(self, batch, logs=None): + self._batch_update_progbar(logs) - def on_batch_begin(self, batch, logs=None): - self.log_values = [] + def on_test_batch_end(self, batch, logs=None): + if not self._called_in_fit: + self._batch_update_progbar(logs) - def on_batch_end(self, batch, logs=None): - logs = logs or {} - batch_size = logs.get('size', 0) - # In case of distribution strategy we can potentially run multiple steps - # at the same time, we should account for that in the `seen` calculation. - num_steps = logs.get('num_steps', 1) - if self.use_steps: - self.seen += num_steps - else: - self.seen += batch_size * num_steps - - for k in self.params['metrics']: - if k in logs: - self.log_values.append((k, logs[k])) - - # Skip progbar update for the last batch; - # will be handled by on_epoch_end. - if self.verbose and (self.target is None or self.seen < self.target): - self.progbar.update(self.seen, self.log_values) + def on_predict_batch_end(self, batch, logs=None): + self._batch_update_progbar(None) # Don't pass prediction results. def on_epoch_end(self, epoch, logs=None): + self._finalize_progbar(logs) + + def on_test_end(self, logs=None): + if not self._called_in_fit: + self._finalize_progbar(logs) + + def on_predict_end(self, logs=None): + self._finalize_progbar(logs) + + def _reset_progbar(self): + self.seen = 0 + self.progbar = None + + def _batch_update_progbar(self, logs=None): + """Updates the progbar.""" + if self.stateful_metrics is None: + if self.model: + self.stateful_metrics = (set(m.name for m in self.model.metrics)) + else: + self.stateful_metrics = set() + + if self.progbar is None: + self.progbar = Progbar( + target=self.target, + verbose=self.verbose, + stateful_metrics=self.stateful_metrics, + unit_name='step' if self.use_steps else 'sample') + + logs = copy.copy(logs) if logs else {} + batch_size = logs.pop('size', 0) + num_steps = logs.pop('num_steps', 1) # DistStrat can run >1 steps. + logs.pop('batch', None) + add_seen = num_steps if self.use_steps else num_steps * batch_size + self.seen += add_seen + self.progbar.update(self.seen, list(logs.items()), finalize=False) + + def _finalize_progbar(self, logs): + if self.target is None: + self.target = self.seen + self.progbar.target = self.seen logs = logs or {} - for k in self.params['metrics']: - if k in logs: - self.log_values.append((k, logs[k])) - if self.verbose: - self.progbar.update(self.seen, self.log_values) + self.progbar.update(self.seen, list(logs.items()), finalize=True) @keras_export('keras.callbacks.History') @@ -826,7 +921,7 @@ class ModelCheckpoint(Callback): - Definition of 'best'; which quantity to monitor and whether it should be maximized or minimized. - The frequency it should save at. 
Currently, the callback supports saving at - the end of every epoch, or after a fixed number of training samples. + the end of every epoch, or after a fixed number of training batches. - Whether only weights are saved, or the whole model is saved. Example: @@ -873,11 +968,10 @@ class ModelCheckpoint(Callback): (`model.save(filepath)`). save_freq: `'epoch'` or integer. When using `'epoch'`, the callback saves the model after each epoch. When using integer, the callback saves the - model at end of a batch at which this many samples have been seen since - last saving. Note that if the saving isn't aligned to epochs, the - monitored metric may potentially be less reliable (it could reflect as - little as 1 batch, since the metrics get reset every epoch). Defaults to - `'epoch'` + model at end of this many batches. Note that if the saving isn't aligned + to epochs, the monitored metric may potentially be less reliable (it + could reflect as little as 1 batch, since the metrics get reset every + epoch). Defaults to `'epoch'` **kwargs: Additional arguments for backwards compatibility. Possible key is `period`. """ @@ -899,7 +993,7 @@ class ModelCheckpoint(Callback): self.save_weights_only = save_weights_only self.save_freq = save_freq self.epochs_since_last_save = 0 - self._samples_seen_since_last_saving = 0 + self._batches_seen_since_last_saving = 0 # Deprecated field `load_weights_on_restart` is for loading the checkpoint # file from `filepath` at the start of `model.fit()` @@ -917,7 +1011,7 @@ class ModelCheckpoint(Callback): if 'period' in kwargs: self.period = kwargs['period'] logging.warning('`period` argument is deprecated. Please use `save_freq` ' - 'to specify the frequency in number of samples seen.') + 'to specify the frequency in number of batches seen.') else: self.period = 1 @@ -1000,15 +1094,15 @@ class ModelCheckpoint(Callback): # Restore the training state so the model is ready for next (possible) # multi worker training. del self._training_state - del self.model._training_state + self.model._training_state = None def on_batch_end(self, batch, logs=None): logs = logs or {} if isinstance(self.save_freq, int): - self._samples_seen_since_last_saving += logs.get('size', 1) - if self._samples_seen_since_last_saving >= self.save_freq: + self._batches_seen_since_last_saving += 1 + if self._batches_seen_since_last_saving >= self.save_freq: self._save_model(epoch=self._current_epoch, logs=logs) - self._samples_seen_since_last_saving = 0 + self._batches_seen_since_last_saving = 0 def on_epoch_begin(self, epoch, logs=None): self._current_epoch = epoch @@ -1228,16 +1322,10 @@ class EarlyStopping(Callback): >>> model = tf.keras.models.Sequential([tf.keras.layers.Dense(10)]) >>> model.compile(tf.keras.optimizers.SGD(), loss='mse') >>> history = model.fit(np.arange(100).reshape(5, 20), np.zeros(5), - ... epochs=10, callbacks=[callback]) - Train on 5 samples - Epoch 1/10 - 5/5 [==============================] - ... loss: 6533.1904 - Epoch 2/10 - 5/5 [==============================] - ... loss: 110183360.0000 - Epoch 3/10 - 5/5 [==============================] - ... loss: 1862575718400.0000 - Epoch 4/10 - 5/5 [==============================] - ... loss: 31485597793124352.0000 + ... epochs=10, batch_size=1, callbacks=[callback], + ... verbose=0) + >>> len(history.history['loss']) # Only 4 epochs are run. 
+ 4 """ def __init__(self, diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py index 6e5066e19ed..bf6d8cda6f2 100644 --- a/tensorflow/python/keras/callbacks_test.py +++ b/tensorflow/python/keras/callbacks_test.py @@ -35,6 +35,7 @@ import numpy as np from tensorflow.core.framework import summary_pb2 from tensorflow.python import keras from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.eager import context from tensorflow.python.framework import random_seed from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import testing_utils @@ -146,9 +147,10 @@ class CallbackCountsTest(keras_parameterized.TestCase): @parameterized.named_parameters(('with_numpy', _get_numpy()), ('with_sequence', _get_sequence())) def test_callback_hooks_are_called_in_fit(self, data): + if not context.executing_eagerly(): + self.skipTest('Behavior changed in v2.') x, y = data val_x, val_y = np.ones((4, 10)), np.ones((4, 1)) - is_sequence = isinstance(x, keras.utils.data_utils.Sequence) model = self._get_model() counter = Counter() @@ -156,8 +158,8 @@ class CallbackCountsTest(keras_parameterized.TestCase): x, y, validation_data=(val_x, val_y), - batch_size=2 if not is_sequence else None, - steps_per_epoch=5 if is_sequence else None, + batch_size=2, + steps_per_epoch=5, epochs=5, callbacks=[counter]) @@ -264,8 +266,8 @@ class KerasCallbacksTest(keras_parameterized.TestCase): def test_progbar_logging(self): model = self._get_model(input_shape=(3,)) - x = array_ops.ones((50, 3)) - y = array_ops.zeros((50, 2)) + x = array_ops.ones((200, 3)) + y = array_ops.zeros((200, 2)) dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(10) expected_log = r'(.*- loss:.*- my_acc:.*)+' @@ -279,8 +281,8 @@ class KerasCallbacksTest(keras_parameterized.TestCase): model = self._get_model() self.assertFalse(model.built) - x = array_ops.ones((50, 3)) - y = array_ops.zeros((50, 2)) + x = array_ops.ones((200, 3)) + y = array_ops.zeros((200, 2)) dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(10) expected_log = r'(.*- loss:.*- my_acc:.*)+' @@ -304,15 +306,15 @@ class KerasCallbacksTest(keras_parameterized.TestCase): self.assertRegexpMatches(printed.contents(), expected_log) @keras_parameterized.run_with_all_model_types - @keras_parameterized.run_all_keras_modes + @keras_parameterized.run_all_keras_modes(always_skip_v1=True) def test_progbar_logging_validation_split(self): model = self._get_model(input_shape=(3,)) x = np.ones((100, 3)) y = np.zeros((100, 2)) expected_log = ( - r'(?s).*1/2.*80/80.*- loss:.*- my_acc:.*- val_loss:.*- val_my_acc:' - r'.*2/2.*80/80.*- loss:.*- my_acc:.*- val_loss:.*- val_my_acc:.*') + r'(?s).*1/2.*8/8.*- loss:.*- my_acc:.*- val_loss:.*- val_my_acc:' + r'.*2/2.*8/8.*- loss:.*- my_acc:.*- val_loss:.*- val_my_acc:.*') with self.captureWritesToStream(sys.stdout) as printed: model.fit(x, y, batch_size=10, epochs=2, validation_split=0.2) @@ -587,7 +589,7 @@ class KerasCallbacksTest(keras_parameterized.TestCase): monitor=monitor, save_best_only=save_best_only, mode=mode, - save_freq=30, + save_freq=15, period=100) # The period should be ignored (this test tests this). ] assert not os.path.exists(filepath.format(epoch=3)) @@ -638,8 +640,8 @@ class KerasCallbacksTest(keras_parameterized.TestCase): def get_input_datasets(): # Simple training input. 
- train_input = [[1]] * 16 - train_label = [[0]] * 16 + train_input = [[1.]] * 16 + train_label = [[0.]] * 16 ds = dataset_ops.Dataset.from_tensor_slices((train_input, train_label)) return ds.batch(8, drop_remainder=True) @@ -1268,40 +1270,40 @@ class KerasCallbacksTest(keras_parameterized.TestCase): values.append(x) assert 'nan' in values[-1], 'The last epoch was not logged.' + @keras_parameterized.run_all_keras_modes(always_skip_v1=True) def test_TerminateOnNaN(self): - with self.cached_session(): - np.random.seed(1337) - (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data( - train_samples=TRAIN_SAMPLES, - test_samples=TEST_SAMPLES, - input_shape=(INPUT_DIM,), - num_classes=NUM_CLASSES) + np.random.seed(1337) + (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data( + train_samples=TRAIN_SAMPLES, + test_samples=TEST_SAMPLES, + input_shape=(INPUT_DIM,), + num_classes=NUM_CLASSES) - y_test = np_utils.to_categorical(y_test) - y_train = np_utils.to_categorical(y_train) - cbks = [keras.callbacks.TerminateOnNaN()] - model = keras.models.Sequential() - initializer = keras.initializers.Constant(value=1e5) - for _ in range(5): - model.add( - keras.layers.Dense( - 2, - input_dim=INPUT_DIM, - activation='relu', - kernel_initializer=initializer)) - model.add(keras.layers.Dense(NUM_CLASSES)) - model.compile(loss='mean_squared_error', optimizer='rmsprop') + y_test = np_utils.to_categorical(y_test) + y_train = np_utils.to_categorical(y_train) + cbks = [keras.callbacks.TerminateOnNaN()] + model = keras.models.Sequential() + initializer = keras.initializers.Constant(value=1e5) + for _ in range(5): + model.add( + keras.layers.Dense( + 2, + input_dim=INPUT_DIM, + activation='relu', + kernel_initializer=initializer)) + model.add(keras.layers.Dense(NUM_CLASSES)) + model.compile(loss='mean_squared_error', optimizer='rmsprop') - history = model.fit( - x_train, - y_train, - batch_size=BATCH_SIZE, - validation_data=(x_test, y_test), - callbacks=cbks, - epochs=20) - loss = history.history['loss'] - self.assertEqual(len(loss), 1) - self.assertEqual(loss[0], np.inf) + history = model.fit( + x_train, + y_train, + batch_size=BATCH_SIZE, + validation_data=(x_test, y_test), + callbacks=cbks, + epochs=20) + loss = history.history['loss'] + self.assertEqual(len(loss), 1) + self.assertTrue(np.isnan(loss[0])) @unittest.skipIf( os.name == 'nt', @@ -1406,14 +1408,17 @@ class KerasCallbacksTest(keras_parameterized.TestCase): callbacks=cbks, epochs=1) - def test_callback_params_samples(self): - x, y = np.ones((64, 3)), np.ones((64, 2)) - model = testing_utils.get_small_sequential_mlp( - num_hidden=10, num_classes=2, input_dim=3) + def test_progbar_infers_steps(self): + x, y = np.ones((10, 1)), np.ones((10, 1)) + data = dataset_ops.DatasetV2.from_tensor_slices((x, y)).batch(2) + data = data.filter(lambda x, y: True) # Unknown cardinality. + + progbar = keras.callbacks.ProgbarLogger('steps') + model = keras.Sequential([keras.layers.Dense(1)]) model.compile('sgd', 'mse') - callback = keras.callbacks.Callback() - model.evaluate(x, y, callbacks=[callback]) - self.assertEqual(callback.params['samples'], 64) + self.assertIsNone(progbar.target) + model.fit(data, epochs=2, callbacks=[progbar]) + self.assertEqual(progbar.target, 5) # A summary that was emitted during a test. 
Fields: diff --git a/tensorflow/python/keras/distribute/distribute_strategy_test.py b/tensorflow/python/keras/distribute/distribute_strategy_test.py index 16f69a4410f..81609d7092c 100644 --- a/tensorflow/python/keras/distribute/distribute_strategy_test.py +++ b/tensorflow/python/keras/distribute/distribute_strategy_test.py @@ -950,10 +950,16 @@ class TestDistributionStrategyWithDatasets(test.TestCase, optimizer='adam', experimental_run_tf_function=experimental_run_tf_function) - def map_fn(img, lbl, weight): - inputs = {'img': img, 'lbl': lbl, 'weight': weight} - targets = {} - return inputs, targets + if context.executing_eagerly(): + + def map_fn(img, lbl, weight): + inputs = {'img': img, 'lbl': lbl, 'weight': weight} + return (inputs,) + else: + + def map_fn(img, lbl, weight): + inputs = {'img': img, 'lbl': lbl, 'weight': weight} + return inputs, {} fake_imgs = np.ones([50, 64, 64, 3], dtype=np.float32) fake_lbls = np.ones([50, 64, 64, 1], dtype=np.float32) @@ -1178,7 +1184,7 @@ class TestDistributionStrategyWithDatasets(test.TestCase, dataset = dataset.repeat(100) dataset = dataset.batch(10) - with self.assertRaisesRegexp(ValueError, 'expected input to have shape'): + with self.assertRaisesRegexp(ValueError, 'incompatible with the layer'): model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0) @combinations.generate( @@ -1776,7 +1782,9 @@ class TestDistributionStrategyWithKerasModels(test.TestCase, experimental_run_tf_function=experimental_run_tf_function) ds_history = ds_model.fit( x, y, validation_data=(x, y), validation_steps=2, epochs=2) - self.assertLen(ds_model.metrics, 1) + # includes stateful loss metric in eager. + metrics_len = 2 if context.executing_eagerly() else 1 + self.assertLen(ds_model.metrics, metrics_len) self.assertAllClose(history.history, ds_history.history) @@ -1830,7 +1838,9 @@ class TestDistributionStrategyWithKerasModels(test.TestCase, experimental_run_tf_function=experimental_run_tf_function) ds_history = ds_model.fit( x, y, validation_data=(x, y), validation_steps=2, epochs=2) - self.assertLen(ds_model.metrics, 1) + # includes stateful loss metric in eager. + metrics_len = 2 if context.executing_eagerly() else 1 + self.assertLen(ds_model.metrics, metrics_len) self.assertAllClose(history.history, ds_history.history) @@ -1870,7 +1880,9 @@ class TestDistributionStrategyWithKerasModels(test.TestCase, experimental_run_tf_function=experimental_run_tf_function) ds_history = ds_model.fit( x, y, validation_data=(x, y), validation_steps=2, epochs=2) - self.assertLen(ds_model.metrics, 1) + # includes stateful loss metric in eager. 
+ metrics_len = 2 if context.executing_eagerly() else 1 + self.assertLen(ds_model.metrics, metrics_len) self.assertAllClose(history.history, ds_history.history) diff --git a/tensorflow/python/keras/distribute/keras_utils_test.py b/tensorflow/python/keras/distribute/keras_utils_test.py index 2454b9cdee6..20a4f98d881 100644 --- a/tensorflow/python/keras/distribute/keras_utils_test.py +++ b/tensorflow/python/keras/distribute/keras_utils_test.py @@ -257,11 +257,8 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase): experimental_run_tf_function=experimental_run_tf_function) dataset = keras_test_lib.get_dataset(distribution) - exception_error_message = ( - '`validation_split` argument is not supported when ') - # Test with validation split - with self.assertRaisesRegexp(ValueError, exception_error_message): + with self.assertRaises(ValueError): model.fit( dataset, epochs=1, @@ -272,9 +269,7 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase): # Test with sample weight. sample_weight = np.random.random((10,)) - with self.assertRaisesRegexp( - ValueError, '`sample_weight` argument is not supported when.*' - 'dataset'): + with self.assertRaises(ValueError): model.fit( dataset, epochs=1, @@ -285,69 +280,14 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase): # Test with not specifying the `steps` argument for dataset with infinite # cardinality. dataset = dataset.repeat() - with self.assertRaisesRegexp( - ValueError, 'When passing an infinitely ' - 'repeating dataset, you must specify the ' - '`steps_per_epoch` argument'): + with self.assertRaises(ValueError): model.fit(dataset, epochs=1, verbose=0) - with self.assertRaisesRegexp( - ValueError, 'When passing an infinitely ' - 'repeating dataset, you must specify the ' - '`steps` argument'): + with self.assertRaises(ValueError): model.evaluate(dataset, verbose=0) - with self.assertRaisesRegexp( - ValueError, 'When passing an infinitely ' - 'repeating dataset, you must specify the ' - '`steps` argument'): + with self.assertRaises(ValueError): model.predict(dataset, verbose=0) - @combinations.generate( - combinations.combine( - distribution=[ - strategy_combinations.mirrored_strategy_with_gpu_and_cpu, - ], - mode=['graph', 'eager'], - experimental_run_tf_function=[True, False])) - def test_calling_with_unsupported_predefined_callbacks( - self, distribution, experimental_run_tf_function): - with self.cached_session(): - with distribution.scope(): - model = keras_test_lib.get_model() - optimizer = gradient_descent.GradientDescentOptimizer(0.001) - loss = 'mse' - metrics = ['mae'] - model.compile( - optimizer, - loss, - metrics=metrics, - experimental_run_tf_function=experimental_run_tf_function) - - dataset = keras_test_lib.get_dataset(distribution) - - def schedule(_): - return 0.001 - - with self.assertRaisesRegexp( - ValueError, 'You must specify a Keras Optimizer V2 when ' - 'using'): - model.fit( - dataset, - epochs=1, - steps_per_epoch=2, - verbose=0, - callbacks=[keras.callbacks.LearningRateScheduler(schedule)]) - - with self.assertRaisesRegexp( - ValueError, 'You must specify a Keras Optimizer V2 when ' - 'using'): - model.fit( - dataset, - epochs=1, - steps_per_epoch=2, - verbose=0, - callbacks=[keras.callbacks.ReduceLROnPlateau()]) - @combinations.generate( combinations.combine( distribution=[ diff --git a/tensorflow/python/keras/engine/BUILD b/tensorflow/python/keras/engine/BUILD index 3ecc31905ba..47765190ff6 100644 --- 
a/tensorflow/python/keras/engine/BUILD +++ b/tensorflow/python/keras/engine/BUILD @@ -29,8 +29,6 @@ py_library( "training_generator.py", "training_utils.py", "training_v1.py", - "training_v2.py", - "training_v2_utils.py", ], srcs_version = "PY2AND3", deps = [ @@ -428,24 +426,6 @@ tf_py_test( ], ) -tf_py_test( - name = "training_v2_utils_test", - size = "medium", - srcs = ["training_v2_utils_test.py"], - python_version = "PY3", - tags = [ - "no_oss", # TODO(b/135021748) reenable - "notsan", - ], - deps = [ - "//tensorflow/python:client_testlib", - "//tensorflow/python/distribute:strategy_combinations", - "//tensorflow/python/keras", - "//third_party/py/numpy", - "@absl_py//absl/testing:parameterized", - ], -) - tf_py_test( name = "network_test", size = "medium", diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index 24d3432fb8e..c097398d90d 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -22,6 +22,7 @@ import collections import functools import itertools import threading +import weakref import numpy as np import six @@ -230,6 +231,8 @@ class Layer(module.Module, version_utils.LayerVersionSelector): # A list of metric instances corresponding to the symbolic metric tensors # added using the `add_metric` API. self._metrics = [] + # Ensures the same metric is not added multiple times in `MirroredStrategy`. + self._metrics_lock = threading.Lock() # Both graph and subclassed networks have a dtype policy. For graph # networks, the policy's compute and variable dtypes are ignored, but other @@ -849,10 +852,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): if hasattr(self, '_set_inputs') and not self.inputs: # Subclassed network: explicitly set metadata normally set by # a call to self._set_inputs(). - # TODO(b/120997007): This should be done in Eager as well, but - # causes garbage collection issues because of the placeholders - # created on the default Keras graph. - self._set_inputs(inputs, outputs) + self._set_inputs(cast_inputs, outputs) else: # Eager execution on data tensors. with backend.name_scope(self._name_scope()): @@ -863,6 +863,8 @@ class Layer(module.Module, version_utils.LayerVersionSelector): outputs = self.call(cast_inputs, *args, **kwargs) self._handle_activity_regularization(inputs, outputs) self._set_mask_metadata(inputs, outputs, input_masks) + if hasattr(self, '_set_save_spec'): + self._set_save_spec(cast_inputs) return outputs @@ -1146,7 +1148,8 @@ class Layer(module.Module, version_utils.LayerVersionSelector): collected_metrics = [] all_layers = self._gather_unique_layers() for layer in all_layers: - collected_metrics.extend(layer._metrics) + with layer._metrics_lock: + collected_metrics.extend(layer._metrics) return collected_metrics @doc_controls.for_subclass_implementers @@ -1938,20 +1941,29 @@ class Layer(module.Module, version_utils.LayerVersionSelector): # on it, otherwise we create a new metric instance and # add it to the `metrics` list. metric_obj = getattr(value, '_metric_obj', None) - if metric_obj: - name = metric_obj.name + # Tensors that come from a Metric object already updated the Metric state. + should_update_state = not metric_obj + name = metric_obj.name if metric_obj else name - match = self._get_existing_metric(name) - if match: - # Tensors that come from a Metric object already updated the Metric state. 
- if not metric_obj: - match(value) - return + with self._metrics_lock: + match = self._get_existing_metric(name) + if match: + metric_obj = match + elif metric_obj: + self._metrics.append(metric_obj) + else: + from tensorflow.python.keras import metrics as metrics_mod # pylint:disable=g-import-not-at-top + if aggregation is None: + raise ValueError( + '`aggregation` must be specified when passing a `Tensor` ' + 'to `add_metric`.') + assert aggregation is not None + metric_obj = metrics_mod.Mean(name=name, dtype=value.dtype) + self._metrics.append(metric_obj) - if not metric_obj: - assert aggregation is not None - metric_obj, _ = base_layer_utils.create_mean_metric(value, name) - self._metrics.append(metric_obj) + if should_update_state: + metric_obj(value) + return def _symbolic_add_metric(self, value, aggregation=None, name=None): base_layer_utils.check_graph_consistency(value, method='add_metric') @@ -2259,7 +2271,8 @@ class Layer(module.Module, version_utils.LayerVersionSelector): layers = trackable_layer_utils.filter_empty_layer_containers(self._layers) # Keep track of each top-level layers' `trainable` as well as the # state of all of its sublayers. - trainable_state = {self: self.trainable} + trainable_state = weakref.WeakKeyDictionary() + trainable_state[self] = self.trainable for layer in layers: trainable_state.update(layer._get_trainable_state()) return trainable_state @@ -2565,10 +2578,12 @@ class Layer(module.Module, version_utils.LayerVersionSelector): # so shouldn't be copied. state = self.__dict__.copy() state.pop('_thread_local', None) + state.pop('_metrics_lock', None) return state def __setstate__(self, state): state['_thread_local'] = threading.local() + state['_metrics_lock'] = threading.Lock() # Bypass Trackable logic as `__dict__` already contains this info. 
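# Illustrative sketch (not part of the patch): the `_metrics_lock` introduced in this
# hunk guards a get-or-create lookup, so that concurrent `add_metric` calls from
# `MirroredStrategy` replica threads cannot register the same metric name twice. A
# minimal, framework-free stand-in of that pattern; all names below are hypothetical.
import threading

class MetricStore(object):
  """Keeps at most one metric object per name, even with concurrent callers."""

  def __init__(self):
    self._metrics = []
    self._metrics_lock = threading.Lock()

  def get_or_create(self, name, factory):
    # Only one thread at a time may search the list and register a new entry.
    with self._metrics_lock:
      for metric in self._metrics:
        if metric.name == name:
          return metric
      metric = factory(name)
      self._metrics.append(metric)
      return metric

# The lock itself cannot be pickled, which is why `__getstate__` drops it and
# `__setstate__` recreates it in this same hunk.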
object.__setattr__(self, '__dict__', state) diff --git a/tensorflow/python/keras/engine/base_layer_test.py b/tensorflow/python/keras/engine/base_layer_test.py index 5e07f77265e..86b0689d026 100644 --- a/tensorflow/python/keras/engine/base_layer_test.py +++ b/tensorflow/python/keras/engine/base_layer_test.py @@ -187,7 +187,7 @@ class BaseLayerTest(keras_parameterized.TestCase): model.compile(rmsprop.RMSprop(0.001), loss='mse') self.assertEqual(model.run_eagerly, True) model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3))) - self.assertEqual(model.outputs, [None]) + self.assertEqual(model.outputs, None) def test_dynamic_subclassed_model_with_shape_inference(self): @@ -210,8 +210,10 @@ class BaseLayerTest(keras_parameterized.TestCase): model = MyModel() self.assertEqual(model.dynamic, True) model.compile(rmsprop.RMSprop(0.001), loss='mse') - model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3))) - self.assertEqual(model.outputs[0].shape.as_list(), [None, 3]) + x, y = np.random.random((2, 3)), np.random.random((2, 3)) + model.train_on_batch(x, y) + outputs = model(x) + self.assertEqual(outputs.shape.as_list(), [2, 3]) def test_deepcopy(self): with context.eager_mode(): @@ -331,42 +333,6 @@ class BaseLayerTest(keras_parameterized.TestCase): keras.backend.set_learning_phase(0) self.assertEqual(get_learning_phase_value(), 0) - @keras_parameterized.run_all_keras_modes - def test_learning_phase_freezing_for_layers_in_predict(self): - if not (testing_utils.should_run_eagerly() or - testing_utils.should_run_tf_function()): - self.skipTest('Predict fails to override the outer learning phase in' - 'the FuncGraph path.') - - class LearningPhaseLayer(keras.layers.Layer): - - def call(self, inputs): - return keras.backend.in_train_phase( - lambda: array_ops.ones_like(inputs), - lambda: array_ops.zeros_like(inputs)) - - def get_learning_phase_value(): - model = keras.models.Sequential([LearningPhaseLayer(input_shape=(1,))]) - model._run_eagerly = testing_utils.should_run_eagerly() - model._experimental_run_tf_function = ( - testing_utils.should_run_tf_function()) - return np.sum(model.predict(np.ones((1, 1)))) - - self.assertEqual(get_learning_phase_value(), 0) - - # Test scope. - with keras.backend.learning_phase_scope(1): - self.assertEqual(get_learning_phase_value(), 0) - - # The effects of the scope end after exiting it. - self.assertEqual(get_learning_phase_value(), 0) - - # Test setting. 
- keras.backend.set_learning_phase(1) - self.assertEqual(get_learning_phase_value(), 0) - keras.backend.set_learning_phase(0) - self.assertEqual(get_learning_phase_value(), 0) - # Cannot be enabled with `run_eagerly=True`, see b/123904578 @test_util.run_all_in_graph_and_eager_modes def test_layer_can_return_variable(self): diff --git a/tensorflow/python/keras/engine/compile_utils.py b/tensorflow/python/keras/engine/compile_utils.py index b9241280d0f..74c6370fce6 100644 --- a/tensorflow/python/keras/engine/compile_utils.py +++ b/tensorflow/python/keras/engine/compile_utils.py @@ -21,9 +21,9 @@ import copy import six +from tensorflow.python.distribute import distribution_strategy_context as ds_context from tensorflow.python.keras import losses as losses_mod from tensorflow.python.keras import metrics as metrics_mod -from tensorflow.python.keras.utils import generic_utils from tensorflow.python.keras.utils import losses_utils from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops @@ -35,6 +35,10 @@ class LossesContainer(object): """A container class for losses passed to `Model.compile`.""" def __init__(self, losses, loss_weights=None, output_names=None): + # Keep user-supplied values untouched for recompiling and serialization. + self._user_losses = losses + self._user_loss_weights = loss_weights + self._losses = losses self._loss_weights = loss_weights self._output_names = output_names @@ -59,7 +63,7 @@ class LossesContainer(object): if self._output_names is None: # In Subclass API, output names like 'output_1' are used for # `Metric` names. - self._output_names = create_output_names(y_pred) + self._output_names = create_pseudo_output_names(y_pred) # Accept a dict of losses keyed by output_name when outputs are a flat # list. @@ -94,7 +98,11 @@ class LossesContainer(object): self._built = True - def __call__(self, y_true, y_pred, sample_weight=None): + def __call__(self, + y_true, + y_pred, + sample_weight=None, + regularization_losses=None): """Computes the overall loss. Arguments: @@ -104,14 +112,19 @@ class LossesContainer(object): per-sample loss weights. If one Tensor is passed, it is used for all losses. If multiple Tensors are passed, the structure should match `y_pred`. + regularization_losses: Additional losses to be added to the total loss. Returns: Tuple of `(total_loss, per_output_loss_list)` """ + y_true = map_to_output_names(y_pred, self._output_names, y_true) + sample_weight = map_to_output_names(y_pred, self._output_names, + sample_weight) + if not self._built: self._build(y_pred) - y_true = nest.flatten(y_true) + y_true = nest.flatten(y_true) if y_true is not None else [] y_pred = nest.flatten(y_pred) # TODO(omalleyt): Remove ambiguity here. @@ -127,45 +140,47 @@ class LossesContainer(object): if len(sample_weight) == 1 and len(y_pred) > 1: sample_weight = sample_weight * len(y_pred) - loss_values = [] + loss_values = [] # Used for gradient calculation. + loss_metric_values = [] # Used for loss metric calculation. zip_args = (y_true, y_pred, sample_weight, self._losses, self._loss_weights, self._per_output_metrics) for y_t, y_p, sw, loss_obj, loss_weight, metric_obj in zip(*zip_args): if loss_obj is None: # Ok to have no loss for an output. continue - y_t = math_ops.cast(y_t, y_p.dtype) - if sw is not None: - sw = math_ops.cast(sw, y_p.dtype) - - # Handle Keras mask on outputs. 
- mask = getattr(y_p, '_keras_mask', None) - if mask is not None: - mask = math_ops.cast(mask, y_p.dtype) - if sw is not None: - mask, _, sw = ( - tf_losses_utils.squeeze_or_expand_dimensions( - mask, sample_weight=sw)) - sw *= mask - else: - sw = mask + y_t, y_p, sw = match_dtype_and_rank(y_t, y_p, sw) + sw = apply_mask(y_p, sw) loss_value = loss_obj(y_t, y_p, sample_weight=sw) + loss_metric_value = loss_value + # Correct for the `Mean` loss metrics counting each replica as a batch. + if loss_obj.reduction == losses_utils.ReductionV2.SUM: + loss_metric_value *= ds_context.get_strategy().num_replicas_in_sync if metric_obj is not None: - metric_obj.update_state(loss_value) + metric_obj.update_state(loss_metric_value) if loss_weight is not None: loss_value *= loss_weight + loss_metric_value *= loss_weight if (loss_obj.reduction == losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE or loss_obj.reduction == losses_utils.ReductionV2.AUTO): loss_value = losses_utils.scale_loss_for_distribution(loss_value) + loss_values.append(loss_value) + loss_metric_values.append(loss_metric_value) + + if regularization_losses: + reg_loss = math_ops.add_n(regularization_losses) + loss_metric_values.append(reg_loss) + loss_values.append(losses_utils.scale_loss_for_distribution(reg_loss)) if loss_values: + total_loss_metric_value = math_ops.add_n(loss_metric_values) + self._loss_metric.update_state(total_loss_metric_value) + total_loss = math_ops.add_n(loss_values) - self._loss_metric.update_state(total_loss) return total_loss else: # Ok for a model to have no compiled loss. @@ -188,7 +203,8 @@ class LossesContainer(object): loss = losses_mod.get(loss) if not isinstance(loss, losses_mod.Loss): - loss = losses_mod.LossFunctionWrapper(loss, name=loss.__name__) + loss_name = loss.__name__ + loss = losses_mod.LossFunctionWrapper(loss, name=loss_name) loss._allow_sum_over_batch_size = True # pylint: disable=protected-access return loss @@ -197,6 +213,10 @@ class MetricsContainer(object): """A container class for metrics passed to `Model.compile`.""" def __init__(self, metrics=None, weighted_metrics=None, output_names=None): + # Keep user-supplied values untouched for recompiling and serialization. + self._user_metrics = metrics + self._user_weighted_metrics = weighted_metrics + self._metrics = metrics self._weighted_metrics = weighted_metrics self._output_names = output_names @@ -207,22 +227,19 @@ class MetricsContainer(object): """Metrics created by this container.""" if not self._built: return [] - metrics = [ - metric_obj for metric_obj in nest.flatten(self._metrics) - if metric_obj is not None - ] - weighted_metrics = [ - metric_obj for metric_obj in nest.flatten(self._weighted_metrics) - if metric_obj is not None - ] - return metrics + weighted_metrics + return self._metrics_in_order def _build(self, y_pred, y_true): """One-time setup of metric objects.""" if self._output_names is None: # Subclass output names like 'output_1' are used for `Metric` names. - self._output_names = create_output_names(y_pred) + self._output_names = create_pseudo_output_names(y_pred) + + # If a single metric or flat list of metrics, apply to all outputs. + self._metrics = self._maybe_broadcast(self._metrics, y_pred) + self._weighted_metrics = self._maybe_broadcast(self._weighted_metrics, + y_pred) # Accept a dict of metrics keyed by output_name when outputs are a flat # list. 
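# Illustrative sketch (not part of the patch): `LossesContainer.__call__` above now
# tracks two running values per output, one fed to the gradient computation
# (`loss_values`) and one reported through the `loss` metric (`loss_metric_values`).
# Plain-Python stand-in of that bookkeeping; `num_replicas`, the reduction strings and
# the division used in place of `losses_utils.scale_loss_for_distribution` are
# assumptions for the example.
def combine_losses(per_output_losses, loss_weights, reductions, num_replicas,
                   regularization_losses=()):
  loss_values = []         # contributes to gradients
  loss_metric_values = []  # contributes to the reported `loss` metric
  for loss, weight, reduction in zip(per_output_losses, loss_weights, reductions):
    metric_value = loss
    if reduction == 'sum':
      # A SUM-reduced loss is averaged over replicas by the `Mean` metric, so it is
      # scaled back up before being reported.
      metric_value *= num_replicas
    if weight is not None:
      loss *= weight
      metric_value *= weight
    if reduction in ('auto', 'sum_over_batch_size'):
      loss /= num_replicas  # stands in for scale_loss_for_distribution
    loss_values.append(loss)
    loss_metric_values.append(metric_value)
  if regularization_losses:
    reg = sum(regularization_losses)
    loss_metric_values.append(reg)
    loss_values.append(reg / num_replicas)
  return sum(loss_values), sum(loss_metric_values)

# Example: combine_losses([1.0], [None], ['auto'], num_replicas=2) returns (0.5, 1.0).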
@@ -231,10 +248,13 @@ class MetricsContainer(object): self._weighted_metrics = map_to_output_names(y_pred, self._output_names, self._weighted_metrics) - # If a single metric is supplied, apply to all outputs. - self._metrics = self._maybe_broadcast(self._metrics, y_pred) - self._weighted_metrics = self._maybe_broadcast(self._weighted_metrics, - y_pred) + # Standardize on tuple since `tf.data` turns lists into `Tensor`s. + # pylint: disable=protected-access + y_pred = nest._list_to_tuple(y_pred) + y_true = nest._list_to_tuple(y_true) + self._metrics = nest._list_to_tuple(self._metrics) + self._weighted_metrics = nest._list_to_tuple(self._weighted_metrics) + # pylint: enable=protected-access # Convert to `Metric` objects, potentially disambiguating based on output # properties. @@ -252,6 +272,17 @@ class MetricsContainer(object): # Assumes metrics, weighted_metrics have been flattened up to outputs. self._set_metric_names() + # Cache the flat order needed when returning metrics, for backwards compat. + self._metrics_in_order = [] + for output_metrics, output_weighted_metrics in zip(self._metrics, + self._weighted_metrics): + for m in nest.flatten(output_metrics): + if m is not None: + self._metrics_in_order.append(m) + for wm in nest.flatten(output_weighted_metrics): + if wm is not None: + self._metrics_in_order.append(wm) + self._built = True def _set_metric_names(self): @@ -277,9 +308,13 @@ class MetricsContainer(object): if wm is None: continue if is_multi_output: - wm._name = output_name + '_' + wm._name - if wm._name in metric_names: + if output_name + '_' + wm._name in metric_names: + wm._name = output_name + '_weighted_' + wm._name + else: + wm._name = output_name + '_' + wm._name + elif wm._name in metric_names: wm._name = 'weighted_' + wm._name + if wm._name in metric_names: raise ValueError('Found two metrics with the same name: {}'.format( wm._name)) @@ -288,9 +323,16 @@ class MetricsContainer(object): def update_state(self, y_true, y_pred, sample_weight=None): """Updates the state of per-output metrics.""" - flat_y_true = nest.flatten(y_true) + y_true = map_to_output_names(y_pred, self._output_names, y_true) + sample_weight = map_to_output_names(y_pred, self._output_names, + sample_weight) + + flat_y_true = nest.flatten(y_true) if y_true is not None else [] flat_y_pred = nest.flatten(y_pred) + if not flat_y_true: + return # Handle case where no targets are passed. + # TODO(omalleyt): Remove ambiguity here (see LossesContainer). if len(flat_y_true) == 1 and len(flat_y_pred) > 1: y_true = nest.map_structure(lambda _: flat_y_true[0], y_pred) @@ -311,21 +353,8 @@ class MetricsContainer(object): zip_args = (y_true, y_pred, sample_weight, self._metrics, self._weighted_metrics) for y_t, y_p, sw, metric_objs, weighted_metric_objs in zip(*zip_args): - y_t = math_ops.cast(y_t, y_p.dtype) - if sw is not None: - sw = math_ops.cast(sw, y_p.dtype) - - # Handle Keras mask on outputs. 
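# Illustrative sketch (not part of the patch): the `nest._list_to_tuple` calls above
# exist because `tf.data` treats a Python list as one stackable value but a tuple as
# separate dataset components. A small demonstration, assuming TensorFlow is
# importable; the trailing comments describe the expected element specs.
import numpy as np
import tensorflow as tf

# As a tuple: two components, each of shape (3,).
ds_tuple = tf.data.Dataset.from_tensors((np.zeros(3), np.ones(3)))
# As a list: a single stacked Tensor component of shape (2, 3).
ds_list = tf.data.Dataset.from_tensors([np.zeros(3), np.ones(3)])
# print(ds_tuple.element_spec)  # (TensorSpec(shape=(3,), ...), TensorSpec(shape=(3,), ...))
# print(ds_list.element_spec)   # TensorSpec(shape=(2, 3), ...)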
- mask = getattr(y_p, '_keras_mask', None) - if mask is not None: - mask = math_ops.cast(mask, y_p.dtype) - if sw is not None: - mask, _, sw = ( - tf_losses_utils.squeeze_or_expand_dimensions( - mask, sample_weight=sw)) - sw *= mask - else: - sw = mask + y_t, y_p, sw = match_dtype_and_rank(y_t, y_p, sw) + sw = apply_mask(y_p, sw) for metric_obj in metric_objs: if metric_obj is None: @@ -339,7 +368,7 @@ class MetricsContainer(object): def _get_metric_objects(self, metrics, y_t, y_p): """Convert user-supplied metrics to `Metric` objects.""" - metrics = generic_utils.to_list(metrics) + metrics = nest.flatten(metrics) return [self._get_metric_object(m, y_t, y_p) for m in metrics] def _get_metric_object(self, metric, y_t, y_p): @@ -399,31 +428,47 @@ class MetricsContainer(object): return metric_obj def _maybe_broadcast(self, metrics, y_pred): - """If a single Metric is supplied, applies it to all outputs.""" + """If a flat list of Metrics is supplied, apply them to all outputs.""" def _should_broadcast(metrics): - single_valued_list = ( - isinstance(metrics, list) and len(metrics) == 1 and - not nest.is_sequence(metrics[0])) - # I.e. `metrics=['accuracy']` or `metrics='accuracy'`. - # In this special case we apply the metric to each output. - return not nest.is_sequence(metrics) or single_valued_list - - def _copy(metric): - if isinstance(metric, metrics_mod.Metric): - return metrics_mod.Metric.from_config(metric.get_config()) - return metric + # e.g. 'mse'. + if not nest.is_sequence(metrics): + return True + # e.g. ['mse'] or ['mse', 'mae']. + return (isinstance(metrics, (list, tuple)) and + not any(nest.is_sequence(m) for m in metrics)) if _should_broadcast(metrics): - metric = metrics[0] if isinstance(metrics, list) else metrics - return nest.map_structure(lambda _: _copy(metric), y_pred) + copy_metrics = len(nest.flatten(y_pred)) > 1 + + def _maybe_copy(m): + if copy_metrics and isinstance(m, metrics_mod.Metric): + return m.__class__.from_config(m.get_config()) + return m + + metrics = nest.flatten(metrics) + return nest.map_structure(lambda _: [_maybe_copy(m) for m in metrics], + y_pred) + return metrics -def create_output_names(y_pred): - """Creates output names for subclassed Model outputs. +def create_pseudo_output_names(outputs): + """Create pseudo output names for a subclassed Model.""" + return _create_pseudo_names(outputs, prefix='output_') - These names are used for naming `Metric`s. + +def create_pseudo_input_names(inputs): + """Create pseudo input names for a subclassed Model.""" + return _create_pseudo_names(inputs, prefix='input_') + + +def _create_pseudo_names(tensors, prefix): + """Creates pseudo {input | output} names for subclassed Models. + + Warning: this function should only be used to define default + names for `Metics` and `SavedModel`. No other use cases should + rely on a `Model`'s input or output names. Example with dict: @@ -436,10 +481,11 @@ def create_output_names(y_pred): `['output_1', 'output_2']` Arguments: - y_pred: `Model`'s outputs. + tensors: `Model`'s outputs or inputs. + prefix: 'output_' for outputs, 'input_' for inputs. Returns: - Flattened list of output names. + Flattened list of pseudo names. 
""" def one_index(ele): @@ -448,18 +494,18 @@ def create_output_names(y_pred): return ele + 1 return ele - flat_paths = list(nest.yield_flat_paths(y_pred)) + flat_paths = list(nest.yield_flat_paths(tensors)) flat_paths = nest.map_structure(one_index, flat_paths) - output_names = [] + names = [] for path in flat_paths: if not path: - output_name = 'output_1' + name = prefix + '1' # Single output. else: - output_name = '_'.join(str(p) for p in path) + name = '_'.join(str(p) for p in path) if isinstance(path[0], int): - output_name = 'output_' + output_name - output_names.append(output_name) - return output_names + name = prefix + name + names.append(name) + return names def map_to_output_names(y_pred, output_names, struct): @@ -473,7 +519,7 @@ def map_to_output_names(y_pred, output_names, struct): For the Functional API, the output names are the names of the last layer of each output. For the Subclass API, the output names - are determined by `create_output_names` (For example: + are determined by `create_pseudo_output_names` (For example: `['output_1', 'output_2']` for a list of outputs). This mapping preserves backwards compatibility for `compile` and @@ -492,17 +538,52 @@ def map_to_output_names(y_pred, output_names, struct): outputs_are_flat_list = ( isinstance(y_pred, (list, tuple)) and not any(nest.is_sequence(y_p) for y_p in y_pred)) - if not outputs_are_flat_list: - # In this case, `y_pred` and `struct` must have the same structure. + single_output = not nest.is_sequence(y_pred) + + if (single_output or outputs_are_flat_list) and isinstance(struct, dict): + output_names = output_names or create_pseudo_output_names(y_pred) + struct = copy.copy(struct) + new_struct = [struct.pop(name, None) for name in output_names] + if struct: + raise ValueError('Found unexpected keys that do not correspond ' + 'to any Model output: {}. Expected: {}'.format( + struct.keys(), output_names)) + if len(new_struct) == 1: + return new_struct[0] + return new_struct + else: return struct - if not isinstance(struct, dict): - return struct - struct = copy.copy(struct) - new_struct = [struct.pop(name, None) for name in output_names] - if struct: - raise ValueError('Found unexpected keys that do not correspond ' - 'to any Model output: {}. Expected: {}'.format( - struct.keys(), output_names)) - return new_struct +def match_dtype_and_rank(y_t, y_p, sw): + """Match dtype and rank of predictions.""" + # Rank. + y_t_rank = len(y_t.shape) + y_p_rank = len(y_p.shape) + if y_t_rank == 1 and y_p_rank == 2: + y_t = array_ops.expand_dims_v2(y_t, axis=-1) + if sw is not None: + sw_rank = len(sw.shape) + if sw_rank == 1 and y_p_rank == 2: + sw = array_ops.expand_dims_v2(sw, axis=-1) + + # Dtype. + y_t = math_ops.cast(y_t, y_p.dtype) + if sw is not None: + sw = math_ops.cast(sw, y_p.dtype) + return y_t, y_p, sw + + +def apply_mask(y_p, sw): + """Applies any mask on predictions to sample weights.""" + # Handle Keras mask on outputs. 
+ mask = getattr(y_p, '_keras_mask', None) + if mask is not None: + mask = math_ops.cast(mask, y_p.dtype) + if sw is not None: + mask, _, sw = ( + tf_losses_utils.squeeze_or_expand_dimensions(mask, sample_weight=sw)) + sw *= mask + else: + sw = mask + return sw diff --git a/tensorflow/python/keras/engine/compile_utils_test.py b/tensorflow/python/keras/engine/compile_utils_test.py index 58d92d41e1f..f888797746d 100644 --- a/tensorflow/python/keras/engine/compile_utils_test.py +++ b/tensorflow/python/keras/engine/compile_utils_test.py @@ -234,29 +234,37 @@ class MetricsContainerTest(keras_parameterized.TestCase): def test_list_of_metrics_list_of_outputs(self): metric_container = compile_utils.MetricsContainer( - metrics=['mse', 'mae'], + metrics=['mse', 'mae'], # Should broadcast to both outputs. weighted_metrics=['accuracy']) # Should broadcast to both outputs. y_t = [array_ops.ones((10, 1)), array_ops.zeros((10, 1))] y_p = [array_ops.ones((10, 1)), 2 * array_ops.ones((10, 1))] sw = ops.convert_to_tensor_v2([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) metric_container.update_state(y_t, y_p, sample_weight=sw) - self.assertLen(metric_container.metrics, 4) + self.assertLen(metric_container.metrics, 6) mse_metric = metric_container.metrics[0] self.assertEqual(mse_metric.name, 'output_1_mse') self.assertEqual(mse_metric.result().numpy(), 0.) - mae_metric = metric_container.metrics[1] - self.assertEqual(mae_metric.name, 'output_2_mae') - self.assertEqual(mae_metric.result().numpy(), 2.) + mse_metric = metric_container.metrics[1] + self.assertEqual(mse_metric.name, 'output_1_mae') + self.assertEqual(mse_metric.result().numpy(), 0.) acc_metric_1 = metric_container.metrics[2] self.assertEqual(acc_metric_1.name, 'output_1_accuracy') self.assertEqual(acc_metric_1.result().numpy(), 1.) self.assertEqual(acc_metric_1._fn, metrics_mod.binary_accuracy) - acc_metric_2 = metric_container.metrics[3] + mae_metric = metric_container.metrics[3] + self.assertEqual(mae_metric.name, 'output_2_mse') + self.assertEqual(mae_metric.result().numpy(), 4.) + + mae_metric = metric_container.metrics[4] + self.assertEqual(mae_metric.name, 'output_2_mae') + self.assertEqual(mae_metric.result().numpy(), 2.) + + acc_metric_2 = metric_container.metrics[5] self.assertEqual(acc_metric_2.name, 'output_2_accuracy') self.assertEqual(acc_metric_2.result().numpy(), 0.) self.assertEqual(acc_metric_2._fn, metrics_mod.binary_accuracy) @@ -281,16 +289,16 @@ class MetricsContainerTest(keras_parameterized.TestCase): self.assertEqual(mse_metric.name, 'out1_mse') self.assertEqual(mse_metric.result().numpy(), 0.) - mae_metric = metric_container.metrics[1] + weighted_mse_metric = metric_container.metrics[1] + self.assertEqual(weighted_mse_metric.name, 'out1_weighted_mse') + self.assertEqual(weighted_mse_metric.result().numpy(), 0.) + + mae_metric = metric_container.metrics[2] self.assertEqual(mae_metric.name, 'out2_mae') self.assertEqual(mae_metric.result().numpy(), 2.) - weighted_mse_metric = metric_container.metrics[2] - self.assertEqual(weighted_mse_metric.name, 'weighted_out1_mse') - self.assertEqual(weighted_mse_metric.result().numpy(), 0.) - weighted_mae_metric = metric_container.metrics[3] - self.assertEqual(weighted_mae_metric.name, 'weighted_out2_mae') + self.assertEqual(weighted_mae_metric.name, 'out2_weighted_mae') self.assertEqual(weighted_mae_metric.result().numpy(), 2.) 
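# Illustrative sketch (not part of the patch): the reworked assertions in this test
# file follow the new rule in `MetricsContainer._set_metric_names`. For multi-output
# models a weighted metric that would collide with an unweighted one is now named
# `<output>_weighted_<metric>` instead of the old `weighted_<output>_<metric>`. A
# minimal stand-in of that rule; the helper name is hypothetical.
def resolve_weighted_name(metric_name, output_name, existing_names, multi_output):
  if multi_output:
    candidate = output_name + '_' + metric_name
    if candidate in existing_names:
      return output_name + '_weighted_' + metric_name
    return candidate
  if metric_name in existing_names:
    return 'weighted_' + metric_name
  return metric_name

# Example: resolve_weighted_name('mse', 'out1', {'out1_mse'}, True) -> 'out1_weighted_mse'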
def test_metric_partial_dict_with_output_names(self): @@ -355,14 +363,14 @@ class MetricsContainerTest(keras_parameterized.TestCase): self.assertEqual(a_mae_metric.name, 'a_mae') self.assertEqual(a_mae_metric.result().numpy(), 1.) - b_1_mse_metric = metric_container.metrics[1] - self.assertEqual(b_1_mse_metric.name, 'b_1_mse') - self.assertEqual(b_1_mse_metric.result().numpy(), 4.) - - weighted_a_mae_metric = metric_container.metrics[2] + weighted_a_mae_metric = metric_container.metrics[1] self.assertEqual(weighted_a_mae_metric.name, 'a_mse') self.assertEqual(weighted_a_mae_metric.result().numpy(), 1.) + b_1_mse_metric = metric_container.metrics[2] + self.assertEqual(b_1_mse_metric.name, 'b_1_mse') + self.assertEqual(b_1_mse_metric.result().numpy(), 4.) + def test_crossentropy(self): metric_container = compile_utils.MetricsContainer('crossentropy') y_t, y_p = array_ops.ones((10, 1)), array_ops.ones((10, 1)) @@ -422,6 +430,29 @@ class MetricsContainerTest(keras_parameterized.TestCase): self.assertEqual(weighted_mae_metric.name, 'weighted_mae') self.assertEqual(weighted_mae_metric.result().numpy(), 0.) + def test_broadcast_metrics_to_dict(self): + metric_container = compile_utils.MetricsContainer(metrics=['mae']) + + y_p = {'output': ops.convert_to_tensor([[0], [1], [2]])} + y_t = {'output': ops.convert_to_tensor([[1], [2], [3]])} + metric_container.update_state(y_t, y_p) + + mae_metric = metric_container.metrics[0] + self.assertEqual(mae_metric.name, 'mae') + self.assertEqual(mae_metric.result().numpy(), 1.) + + def test_broadcast_metrics_to_dict_with_output_names(self): + metric_container = compile_utils.MetricsContainer( + metrics=['mae'], output_names=['output']) + + y_p = ops.convert_to_tensor([[0], [1], [2]]) + y_t = {'output': ops.convert_to_tensor([[1], [2], [3]])} + metric_container.update_state(y_t, y_p) + + mae_metric = metric_container.metrics[0] + self.assertEqual(mae_metric.name, 'mae') + self.assertEqual(mae_metric.result().numpy(), 1.) + if __name__ == '__main__': ops.enable_eager_execution() diff --git a/tensorflow/python/keras/engine/data_adapter.py b/tensorflow/python/keras/engine/data_adapter.py index d040a1fbdaa..3fc66d05b6f 100644 --- a/tensorflow/python/keras/engine/data_adapter.py +++ b/tensorflow/python/keras/engine/data_adapter.py @@ -36,6 +36,9 @@ from tensorflow.python.distribute import distribution_strategy_context as ds_con from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops +from tensorflow.python.framework import smart_cond +from tensorflow.python.framework import sparse_tensor +from tensorflow.python.framework import tensor_shape from tensorflow.python.framework.ops import composite_tensor from tensorflow.python.keras import backend from tensorflow.python.keras.engine import training_utils @@ -211,6 +214,15 @@ class DataAdapter(object): """Returns whether a new iterator should be created every epoch.""" raise NotImplementedError + def get_samples(self): + """Returns number of samples in the data, or `None`.""" + if not self.get_size() or not self.batch_size(): + return None + total_sample = self.get_size() * self.batch_size() + if self.has_partial_batch(): + total_sample -= (self.batch_size() - self.partial_batch_size()) + return total_sample + class TensorLikeDataAdapter(DataAdapter): """Adapter that handles Tensor-like objects, e.g. 
EagerTensor and NumPy.""" @@ -245,25 +257,15 @@ class TensorLikeDataAdapter(DataAdapter): shuffle=False, **kwargs): super(TensorLikeDataAdapter, self).__init__(x, y, **kwargs) - x = _process_numpy_inputs(x) - y = _process_numpy_inputs(y) - sample_weights = _process_numpy_inputs(sample_weights) + x, y, sample_weights = _process_tensorlike((x, y, sample_weights)) sample_weight_modes = broadcast_sample_weight_modes( sample_weights, sample_weight_modes) # If sample_weights are not specified for an output use 1.0 as weights. - (sample_weights, any_sample_weight, _ - ) = training_utils.handle_partial_sample_weights( + (sample_weights, _, _) = training_utils.handle_partial_sample_weights( y, sample_weights, sample_weight_modes, check_all_flat=True) - if y is not None and any_sample_weight: - inputs = (x, y, sample_weights) - elif y is not None: - # Sample weight is only needed for training, so if y is None, then - # sample_weight is ignored. - inputs = (x, y) - else: - inputs = (x,) + inputs = pack_x_y_sample_weight(x, y, sample_weights) num_samples = set(int(i.shape[0]) for i in nest.flatten(inputs)) if len(num_samples) > 1: @@ -276,13 +278,9 @@ class TensorLikeDataAdapter(DataAdapter): num_samples = num_samples.pop() # If batch_size is not passed but steps is, calculate from the input data. - if steps and not batch_size: - batch_size = int(math.ceil(num_samples / steps)) - + # Default to 32 for backwards compat. if not batch_size: - raise ValueError( - "`batch_size` or `steps` is required for `Tensor` or `NumPy`" - " input data.") + batch_size = int(math.ceil(num_samples / steps)) if steps else 32 self._size = int(math.ceil(num_samples / batch_size)) self._batch_size = batch_size @@ -557,25 +555,15 @@ class CompositeTensorDataAdapter(DataAdapter): shuffle=False, **kwargs): super(CompositeTensorDataAdapter, self).__init__(x, y, **kwargs) - x = _process_numpy_inputs(x) - y = _process_numpy_inputs(y) - sample_weights = _process_numpy_inputs(sample_weights) + x, y, sample_weights = _process_tensorlike((x, y, sample_weights)) sample_weight_modes = broadcast_sample_weight_modes( sample_weights, sample_weight_modes) # If sample_weights are not specified for an output use 1.0 as weights. - (sample_weights, any_sample_weight, _ - ) = training_utils.handle_partial_sample_weights( + (sample_weights, _, _) = training_utils.handle_partial_sample_weights( y, sample_weights, sample_weight_modes, check_all_flat=True) - if y is not None and any_sample_weight: - inputs = (x, y, sample_weights) - elif y is not None: - # Sample weight is only needed for training, so if y is None, then - # sample_weight is ignored. - inputs = (x, y) - else: - inputs = (x,) + inputs = pack_x_y_sample_weight(x, y, sample_weights) dataset = dataset_ops.DatasetV2.from_tensor_slices(inputs) num_samples = int(nest.flatten(x)[0].shape[0]) @@ -583,13 +571,9 @@ class CompositeTensorDataAdapter(DataAdapter): dataset = dataset.shuffle(num_samples) # If batch_size is not passed but steps is, calculate from the input data. - if steps and not batch_size: - batch_size = int(math.ceil(num_samples/steps)) - + # Default to 32 for backwards compat. 
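# Illustrative sketch (not part of the patch): both adapter hunks in this file replace
# the old "`batch_size` or `steps` is required" error with a fallback: derive the batch
# size from `steps` when it is given, otherwise assume 32 for backwards compatibility.
# Stand-alone stand-in of that rule.
import math

def infer_batch_size(num_samples, batch_size=None, steps=None):
  if batch_size:
    return batch_size
  if steps:
    return int(math.ceil(num_samples / steps))
  return 32  # assumed default, mirroring the hunks above

# Example: infer_batch_size(100, steps=8) == 13, infer_batch_size(100) == 32.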
if not batch_size: - raise ValueError( - "`batch_size` or `steps` is required for `Tensor` or `NumPy`" - " input data.") + batch_size = int(math.ceil(num_samples / steps)) if steps else 32 dataset = dataset.batch(batch_size) self._size = int(math.ceil(num_samples / batch_size)) @@ -648,7 +632,6 @@ class ListsOfScalarsDataAdapter(DataAdapter): sample_weight_modes=None, batch_size=None, shuffle=False, - standardize_function=None, **kwargs): super(ListsOfScalarsDataAdapter, self).__init__(x, y, **kwargs) x = np.asarray(x) @@ -659,10 +642,6 @@ class ListsOfScalarsDataAdapter(DataAdapter): sample_weight_modes = broadcast_sample_weight_modes( sample_weights, sample_weight_modes) - if standardize_function is not None: - x, y, sample_weights = standardize_function( - x=x, y=y, sample_weight=sample_weights) - self._internal_adapter = TensorLikeDataAdapter( x, y=y, @@ -703,32 +682,22 @@ class DatasetAdapter(DataAdapter): y=None, sample_weights=None, steps=None, - standardize_function=None, **kwargs): super(DatasetAdapter, self).__init__(x, y, **kwargs) - if not is_none_or_empty(y): - raise ValueError("`y` argument is not supported when using " - "dataset as input.") - if not is_none_or_empty(sample_weights): - raise ValueError("`sample_weight` argument is not supported when using " - "dataset as input.") - - if standardize_function is not None: - x = standardize_function(x) - - # Note that the dataset instance is immutable, its fine to reusing the user + # Note that the dataset instance is immutable, its fine to reuse the user # provided dataset. self._dataset = x # The user-provided steps. self._user_steps = steps + self._validate_args(y, sample_weights, steps) + def get_dataset(self): return self._dataset def get_size(self): - # The size of dataset is unknown, unless its fully consumed. - return None + return # Inferred in `DataHandler`. def batch_size(self): return None @@ -746,6 +715,21 @@ class DatasetAdapter(DataAdapter): return (self._user_steps is None or cardinality.cardinality(self._dataset).numpy() == self._user_steps) + def _validate_args(self, y, sample_weights, steps): + """Validates `__init__` arguments.""" + # Arguments that shouldn't be passed. + if not is_none_or_empty(y): + raise ValueError("`y` argument is not supported when using " + "dataset as input.") + if not is_none_or_empty(sample_weights): + raise ValueError("`sample_weight` argument is not supported when using " + "dataset as input.") + + size = cardinality.cardinality(self._dataset).numpy() + if size == cardinality.INFINITE and steps is None: + raise ValueError("When providing an infinite dataset, you must specify " + "the number of steps to run.") + class GeneratorDataAdapter(DataAdapter): """Adapter that handles python generators and iterators.""" @@ -756,8 +740,14 @@ class GeneratorDataAdapter(DataAdapter): and hasattr(x, "__iter__") and not isinstance(x, data_utils.Sequence)) - def __init__(self, x, y=None, sample_weights=None, standardize_function=None, - workers=1, use_multiprocessing=False, max_queue_size=10, + def __init__(self, + x, + y=None, + sample_weights=None, + workers=1, + use_multiprocessing=False, + max_queue_size=10, + model=None, **kwargs): # Generators should never shuffle as exhausting the generator in order to # shuffle the batches is inefficient. 
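# Illustrative sketch (not part of the patch): `DatasetAdapter._validate_args` above
# now rejects an infinite (repeated) dataset when no step count is supplied, which is
# why the message-specific assertions in keras_utils_test were loosened to plain
# `assertRaises(ValueError)`. Minimal stand-in; `INFINITE = -1` mirrors
# `tf.data.INFINITE_CARDINALITY` and is an assumption here.
INFINITE = -1

def check_steps_for_cardinality(dataset_cardinality, steps):
  if dataset_cardinality == INFINITE and steps is None:
    raise ValueError("When providing an infinite dataset, you must specify "
                     "the number of steps to run.")
  return steps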
@@ -769,115 +759,75 @@ class GeneratorDataAdapter(DataAdapter): if not is_none_or_empty(sample_weights): raise ValueError("`sample_weight` argument is not supported when using " "python generator as input.") + super(GeneratorDataAdapter, self).__init__(x, y, **kwargs) # Since we have to know the dtype of the python generator when we build the # dataset, we have to look at a batch to infer the structure. peek, x = self._peek_and_restore(x) assert_not_namedtuple(peek) + peek = self._standardize_batch(peek) + peek = _process_tensorlike(peek) - (peek, wrap_in_tuple, elements_to_keep, partial_sample_weight, - sample_weight_modes, nested_shape, nested_dtypes - ) = self._canonicalize_peek(peek, kwargs.get("sample_weight_modes")) + # Need to build the Model on concrete input shapes. + if model is not None and not model.built: + concrete_x, _, _ = unpack_x_y_sample_weight(peek) + model.distribute_strategy.experimental_run_v2( + lambda x: model(x, training=False), args=(concrete_x,)) + + self._first_batch_size = int(nest.flatten(peek)[0].shape[0]) + + def _get_dynamic_shape(t): + shape = t.shape + # Unknown number of dimensions, `as_list` cannot be called. + if shape.rank is None: + return shape + return tensor_shape.TensorShape([None for _ in shape.as_list()]) + + output_shapes = nest.map_structure(_get_dynamic_shape, peek) + output_types = nest.map_structure(lambda t: t.dtype, peek) # Note that dataset API takes a callable that creates a generator object, # rather than generator itself, which is why we define a function here. - generator_fn = self._make_callable(x, workers, use_multiprocessing, - max_queue_size) + generator_fn = self._handle_multiprocessing(x, workers, use_multiprocessing, + max_queue_size) - generator_fn = self._make_bridging_callable( - generator_fn, wrap_in_tuple, peek, elements_to_keep, - partial_sample_weight, sample_weight_modes) + def wrapped_generator(): + for data in generator_fn(): + yield self._standardize_batch(data) dataset = dataset_ops.DatasetV2.from_generator( - generator_fn, nested_dtypes, output_shapes=nested_shape) - - if standardize_function is not None: - dataset = standardize_function(dataset) + wrapped_generator, output_types, output_shapes=output_shapes) if workers == 1 and not use_multiprocessing: dataset = dataset.prefetch(1) self._dataset = dataset - def _canonicalize_peek(self, peek, sample_weight_modes): - """Map the peeked batch into a regular form. + def _standardize_batch(self, data): + """Standardizes a batch output by a generator.""" + # Removes `None`s. + x, y, sample_weight = unpack_x_y_sample_weight(data) + data = pack_x_y_sample_weight(x, y, sample_weight) - This function serves two purposes. First, it determines if per-batch - transformations are needed. Second, it extracts the structure to be used - by Dataset.from_generator. + data = nest._list_to_tuple(data) # pylint: disable=protected-access - Args: - peek: The first batch of the user's data - sample_weight_modes: Optional structure indicating how to handle sample - weights. If it is a string, it will be mapped to match the target - structure. + def _convert_dtype(t): + if (isinstance(t, np.ndarray) and issubclass(t.dtype.type, np.floating)): + return np.array(t, dtype=backend.floatx()) + return t - Returns: - An updated peek and various inspection results. 
- """ - wrap_in_tuple = False - if not isinstance(peek, tuple): - peek, wrap_in_tuple = (peek,), True - - if len(peek) not in (1, 2, 3): - raise ValueError( - "Output of generator should be a tuple of 1 or 2 or 3 elements: " - "(input,) or (input, target) or (input, target, sample_weights). " - "Received {}".format(peek)) - - x_peek, y_peek, sample_weights_peek = list(peek) + [None] * (3 - len(peek)) - - any_sample_weight, partial_sample_weight = False, False - sample_weight_modes = broadcast_sample_weight_modes( - sample_weights_peek if sample_weights_peek is not None else y_peek, - sample_weight_modes) - - if len(peek) == 3: - (sample_weights_peek, any_sample_weight, partial_sample_weight - ) = training_utils.handle_partial_sample_weights( - y_peek, sample_weights_peek, sample_weight_modes, check_all_flat=True) - peek = (x_peek, y_peek, sample_weights_peek) - - # Users often return None for fields which are not used. For instance: - # (x, y, None) to indicate no sample weights. - if len(peek) >= 2 and y_peek is None: - if any_sample_weight: - raise ValueError("Found sample weights but no targets\n{}".format(peek)) - elements_to_keep = 1 - elif len(peek) == 3 and not any_sample_weight: - elements_to_keep = 2 - else: - elements_to_keep = len(peek) - - def dynamic_shape_like(t): - return tuple(None for _ in t.shape) - - def convert_for_inspection(t): - if getattr(t, "shape", None) and getattr(t, "dtype", None): - return t - return np.array(t, dtype=backend.floatx()) - - canonicalized_peek = nest._list_to_tuple( # pylint: disable=protected-access - nest.map_structure(convert_for_inspection, peek[:elements_to_keep])) - nested_dtypes = nest.map_structure(lambda t: t.dtype, canonicalized_peek) - nested_shape = nest.map_structure(dynamic_shape_like, canonicalized_peek) - - try: - self._first_batch_size = int(nest.flatten(canonicalized_peek)[0].shape[0]) - except IndexError: - raise IndexError("Could not infer batch size from: {}".format(peek)) - - return (peek, wrap_in_tuple, elements_to_keep, partial_sample_weight, - sample_weight_modes, nested_shape, nested_dtypes) + data = nest.map_structure(_convert_dtype, data) + return data @staticmethod def _peek_and_restore(x): peek = next(x) return peek, itertools.chain([peek], x) - def _make_callable(self, x, workers, use_multiprocessing, max_queue_size): - """Create a callable, and possibly include an Enqueuer.""" + def _handle_multiprocessing(self, x, workers, use_multiprocessing, + max_queue_size): + """Create a callable, possibly including an Enqueuer.""" if workers > 1 or (workers > 0 and use_multiprocessing): if use_multiprocessing: logging.warning( @@ -893,44 +843,6 @@ class GeneratorDataAdapter(DataAdapter): generator_fn = lambda: x return generator_fn - @staticmethod - def _make_bridging_callable( - generator_fn, wrap_in_tuple, peek, elements_to_keep, - partial_sample_weight, sample_weight_modes): - """Optional compatibility layer between user's data and Dataset.""" - must_prune_nones = (elements_to_keep != len(peek)) - try: - nest.assert_same_structure(peek, nest._list_to_tuple(peek)) # pylint: disable=protected-access - must_extract_lists = False - except TypeError: - must_extract_lists = True - - # No additional transformations are needed. 
- if not (wrap_in_tuple or must_extract_lists or must_prune_nones or - partial_sample_weight): - return generator_fn - - def wrapped_generator(): - """Remove Nones and lists before invoking Dataset.from_generator.""" - for batch in generator_fn(): - if wrap_in_tuple: - batch = (batch,) - - if must_extract_lists: - batch = nest._list_to_tuple(batch) # pylint: disable=protected-access - - if must_prune_nones: - batch = batch[:elements_to_keep] - - if partial_sample_weight: - sample_weights, _, _ = training_utils.handle_partial_sample_weights( - batch[1], batch[2], sample_weight_modes, check_all_flat=False) - batch = batch[:2] + (sample_weights,) - - yield batch - - return wrapped_generator - def get_dataset(self): return self._dataset @@ -960,31 +872,40 @@ class KerasSequenceAdapter(GeneratorDataAdapter): def can_handle(x, y=None): return isinstance(x, data_utils.Sequence) - def __init__(self, x, y=None, sample_weights=None, standardize_function=None, - shuffle=False, workers=1, use_multiprocessing=False, - max_queue_size=10, **kwargs): + def __init__(self, + x, + y=None, + sample_weights=None, + shuffle=False, + workers=1, + use_multiprocessing=False, + max_queue_size=10, + model=None, + **kwargs): if not is_none_or_empty(y): raise ValueError("`y` argument is not supported when using " "`keras.utils.Sequence` as input.") if not is_none_or_empty(sample_weights): raise ValueError("`sample_weight` argument is not supported when using " "`keras.utils.Sequence` as input.") + self._size = len(x) self._shuffle_sequence = shuffle super(KerasSequenceAdapter, self).__init__( x, - standardize_function=standardize_function, shuffle=False, # Shuffle is handed in the _make_callable override. workers=workers, use_multiprocessing=use_multiprocessing, max_queue_size=max_queue_size, + model=model, **kwargs) @staticmethod def _peek_and_restore(x): return x[0], x - def _make_callable(self, x, workers, use_multiprocessing, max_queue_size): + def _handle_multiprocessing(self, x, workers, use_multiprocessing, + max_queue_size): if workers > 1 or (workers > 0 and use_multiprocessing): def generator_fn(): enqueuer = data_utils.OrderedEnqueuer( @@ -1051,37 +972,34 @@ def _type_name(x): return str(type(x)) -def _process_numpy_inputs(inputs): - """Process numpy array inputs. +def _process_tensorlike(inputs): + """Process tensor-like inputs. - For numpy inputs, it is possible to be single numpy array, or list/dict of - them. They could also be preprocessed by other lib to match with the order - of position for the model. The result here should be something that can be - used to build dataset. + This function: + + (1) Converts `Numpy` arrays to `Tensor`s. + (2) Converts `Scipy` sparse matrices to `SparseTensor`s. + (2) Converts `list`s to `tuple`s (for `tf.data` support). Args: - inputs: single or list/tuple/dict of numpy array. - Returns: - numpy arrays can be used to build dataset. - """ - if is_none_or_empty(inputs): - return None - flat_inputs = nest.flatten(inputs) - if len(flat_inputs) == 1: - return flat_inputs[0] + inputs: Structure of `Tensor`s, `NumPy` arrays, or tensor-like. - def _convert_non_tensor(x): - # Don't call `ops.convert_to_tensor_v2` on all `inputs` because - # `SparseTensors` can't be converted to `Tensor`. + Returns: + Structure of `Tensor`s or tensor-like. 
+ """ + + def _convert_numpy_and_scipy(x): if isinstance(x, np.ndarray): - return ops.convert_to_tensor_v2(x) + dtype = None + if issubclass(x.dtype.type, np.floating): + dtype = backend.floatx() + return ops.convert_to_tensor(x, dtype=dtype) + elif scipy_sparse and scipy_sparse.issparse(x): + return _scipy_sparse_to_sparse_tensor(x) return x - inputs = nest.map_structure(_convert_non_tensor, inputs) - # For more complicated structure, we only convert the out most list to tuple - # since dataset will stack the list, but treat elements in the tuple as - # individual element. - return training_utils.list_to_tuple(inputs) + inputs = nest.map_structure(_convert_numpy_and_scipy, inputs) + return nest._list_to_tuple(inputs) # pylint: disable=protected-access def is_none_or_empty(inputs): @@ -1147,8 +1065,6 @@ def assert_not_namedtuple(x): class DataHandler(object): """Handles iterating over epoch-level `tf.data.Iterator` objects.""" - # TODO(omalleyt): Handle `validation_split` with separate utility. - # TODO(omalleyt): Handle `validation_data` batch size when `x` is a gen. def __init__(self, x, y=None, @@ -1161,7 +1077,8 @@ class DataHandler(object): class_weight=None, max_queue_size=10, workers=1, - use_multiprocessing=False): + use_multiprocessing=False, + model=None): self._initial_epoch = initial_epoch self._epochs = epochs @@ -1173,20 +1090,21 @@ class DataHandler(object): y, batch_size=batch_size, steps=steps_per_epoch, - epochs=epochs, + epochs=epochs - initial_epoch, sample_weights=sample_weight, shuffle=shuffle, max_queue_size=max_queue_size, workers=workers, use_multiprocessing=use_multiprocessing, - distribution_strategy=ds_context.get_strategy()) + distribution_strategy=ds_context.get_strategy(), + model=model) strategy = ds_context.get_strategy() dataset = self._train_adapter.get_dataset() if class_weight: dataset = dataset.map(_make_class_weight_map_fn(class_weight)) + self._steps_per_epoch = self._infer_steps(steps_per_epoch, dataset) self._train_dataset = strategy.experimental_distribute_dataset(dataset) - self._steps_per_epoch = self._infer_steps(steps_per_epoch) def enumerate_epochs(self): """Yields `(epoch, tf.data.Iterator)`.""" @@ -1231,7 +1149,7 @@ class DataHandler(object): yield self._current_step self._current_step += 1 - def _infer_steps(self, steps): + def _infer_steps(self, steps, dataset): """Infers steps_per_epoch needed to loop through a dataset.""" if steps is not None: return steps @@ -1240,7 +1158,6 @@ class DataHandler(object): if adapter_steps is not None: return adapter_steps - dataset = self._train_dataset if (ds_context.get_strategy().extended._in_multi_worker_mode() and # pylint: disable=protected-access (dataset.options().experimental_distribute.auto_shard_policy != distribute_options.AutoShardPolicy.OFF)): @@ -1256,6 +1173,14 @@ class DataHandler(object): return size return None + @property + def _samples(self): + return self._train_adapter.get_samples() + + @property + def _steps(self): + return self._train_adapter.get_size() + def _make_class_weight_map_fn(class_weight): """Applies class weighting to a `Dataset`. 
@@ -1280,25 +1205,29 @@ def _make_class_weight_map_fn(class_weight): raise ValueError(error_msg) class_weight_tensor = ops.convert_to_tensor_v2( - [class_weight[c] for c in class_ids]) + [int(class_weight[c]) for c in class_ids], dtype="int64") def _class_weights_map_fn(*data): """Convert `class_weight` to `sample_weight`.""" - if len(data) == 2: - x, y = data - sw = None - else: - x, y, sw = data + x, y, sw = unpack_x_y_sample_weight(data) if nest.is_sequence(y): raise ValueError( - "`class_weight` is only supported for `Model`s with a single output.") + "`class_weight` is only supported for Models with a single output.") - cw = array_ops.gather_v2(class_weight_tensor, y) + if y.shape.rank > 2: + raise ValueError("`class_weight` not supported for " + "3+ dimensional targets.") + + y_classes = smart_cond.smart_cond( + y.shape.rank == 2 and backend.shape(y)[1] > 1, + lambda: backend.argmax(y, axis=1), + lambda: math_ops.cast(backend.reshape(y, (-1,)), dtypes.int64)) + + cw = array_ops.gather_v2(class_weight_tensor, y_classes) if sw is not None: cw = math_ops.cast(cw, sw.dtype) - if len(cw.shape.as_list()) > len(sw.shape.as_list()): - cw = array_ops.squeeze(cw) + sw, cw = expand_1d((sw, cw)) # `class_weight` and `sample_weight` are multiplicative. sw = sw * cw else: @@ -1309,6 +1238,18 @@ def _make_class_weight_map_fn(class_weight): return _class_weights_map_fn +def expand_1d(data): + """Expands 1-dimensional `Tensor`s into 2-dimensional `Tensor`s.""" + + def _expand_single_1d_tensor(t): + if (hasattr(t, "shape") and + isinstance(t.shape, tensor_shape.TensorShape) and t.shape.rank == 1): + return array_ops.expand_dims_v2(t, axis=-1) + return t + + return nest.map_structure(_expand_single_1d_tensor, data) + + def train_validation_split(arrays, validation_split, shuffle=True): """Split arrays into random train and validation subsets. 
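# Illustrative sketch (not part of the patch): the reworked `_class_weights_map_fn`
# above supports both sparse integer targets of shape (batch, 1) and one-hot targets of
# shape (batch, classes); one-hot rows are reduced with argmax before indexing the
# class-weight table. NumPy stand-in of that lookup; the helper name is hypothetical.
import numpy as np

def lookup_class_weights(y, class_weight_table):
  y = np.asarray(y)
  if y.ndim == 2 and y.shape[1] > 1:
    class_ids = np.argmax(y, axis=1)             # one-hot targets
  else:
    class_ids = y.reshape(-1).astype(np.int64)   # sparse integer targets
  return class_weight_table[class_ids]

# Example: lookup_class_weights([[0, 1, 0]], np.array([0.5, 1.0, 1.5])) -> array([1.0])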
@@ -1368,3 +1309,60 @@ def train_validation_split(arrays, validation_split, shuffle=True): functools.partial(_split, indices=val_indices), arrays) return train_arrays, val_arrays + + +def unpack_x_y_sample_weight(data): + """Unpacks user-provided data tuple.""" + if not isinstance(data, tuple): + return (data, None, None) + elif len(data) == 1: + return (data[0], None, None) + elif len(data) == 2: + return (data[0], data[1], None) + elif len(data) == 3: + return (data[0], data[1], data[2]) + + raise ValueError("Data not understood.") + + +def pack_x_y_sample_weight(x, y=None, sample_weight=None): + """Packs user-provided data into a tuple.""" + if y is None: + return (x,) + elif sample_weight is None: + return (x, y) + else: + return (x, y, sample_weight) + + +def single_batch_iterator(strategy, + x, + y=None, + sample_weight=None, + class_weight=None): + """Creates a single-batch dataset.""" + x, y, sample_weight = _process_tensorlike((x, y, sample_weight)) + if y is None: + data = (x,) + elif sample_weight is None: + data = (x, y) + else: + data = (x, y, sample_weight) + + dataset = dataset_ops.DatasetV2.from_tensors(data) + if class_weight: + dataset = dataset.map(_make_class_weight_map_fn(class_weight)) + dataset = strategy.experimental_distribute_dataset(dataset) + return iter(dataset) + + +def _scipy_sparse_to_sparse_tensor(t): + """Converts a SciPy sparse matrix to a SparseTensor.""" + sparse_coo = t.tocoo() + row, col = sparse_coo.row, sparse_coo.col + data, shape = sparse_coo.data, sparse_coo.shape + if issubclass(data.dtype.type, np.floating): + data = data.astype(backend.floatx()) + indices = np.concatenate( + (np.expand_dims(row, axis=1), np.expand_dims(col, axis=1)), axis=1) + return sparse_tensor.SparseTensor(indices, data, shape) diff --git a/tensorflow/python/keras/engine/data_adapter_test.py b/tensorflow/python/keras/engine/data_adapter_test.py index 1bb91303aa8..75ddf0f7d6e 100644 --- a/tensorflow/python/keras/engine/data_adapter_test.py +++ b/tensorflow/python/keras/engine/data_adapter_test.py @@ -124,11 +124,6 @@ class TensorLikeDataAdapterTest(DataAdapterTestBase): self.assertFalse(self.adapter_cls.can_handle(self.generator_input)) self.assertFalse(self.adapter_cls.can_handle(self.sequence_input)) - def test_iterator_expect_batch_size_numpy(self): - with self.assertRaisesRegexp( - ValueError, r'`batch_size` or `steps` is required'): - self.adapter_cls(self.numpy_input, self.numpy_target) - def test_size_numpy(self): adapter = self.adapter_cls( self.numpy_input, self.numpy_target, batch_size=5) @@ -428,12 +423,6 @@ class GenericArrayLikeDataAdapterTest(DataAdapterTestBase): self.assertFalse(self.adapter_cls.can_handle(self.generator_input)) self.assertFalse(self.adapter_cls.can_handle(self.sequence_input)) - def test_iterator_expect_batch_size_generic_arraylike(self): - with self.assertRaisesRegexp( - ValueError, r'`batch_size` or `steps` is required'): - self.adapter_cls(self.arraylike_input, - self.arraylike_target) - def test_size(self): adapter = self.adapter_cls( self.arraylike_input, @@ -885,6 +874,7 @@ class DataHandlerTest(keras_parameterized.TestCase): def test_insufficient_data(self): ds = dataset_ops.DatasetV2.from_tensor_slices([0, 1]) + ds = ds.filter(lambda *args, **kwargs: True) data_handler = data_adapter.DataHandler( ds, initial_epoch=0, epochs=2, steps_per_epoch=3) returned_data = [] @@ -963,53 +953,6 @@ class DataHandlerTest(keras_parameterized.TestCase): self.assertEqual(returned_data, [[([0],), ([1],), ([2],)], [([0],), ([1],), ([2],)]]) - def 
test_class_weight(self): - data_handler = data_adapter.DataHandler( - x=[[0], [1], [2]], - y=[[2], [1], [0]], - class_weight={ - 0: 0.5, - 1: 1., - 2: 1.5 - }, - epochs=2, - steps_per_epoch=3) - returned_data = [] - for _, iterator in data_handler.enumerate_epochs(): - epoch_data = [] - for _ in data_handler.steps(): - epoch_data.append(next(iterator)) - returned_data.append(epoch_data) - returned_data = self.evaluate(returned_data) - self.assertEqual(returned_data, [[([0], [2], [1.5]), ([1], [1], [1.]), - ([2], [0], [0.5])], - [([0], [2], [1.5]), ([1], [1], [1.]), - ([2], [0], [0.5])]]) - - def test_class_weight_and_sample_weight(self): - data_handler = data_adapter.DataHandler( - x=[[0], [1], [2]], - y=[[2], [1], [0]], - sample_weight=[[1.], [2.], [4.]], - class_weight={ - 0: 0.5, - 1: 1., - 2: 1.5 - }, - epochs=2, - steps_per_epoch=3) - returned_data = [] - for _, iterator in data_handler.enumerate_epochs(): - epoch_data = [] - for _ in data_handler.steps(): - epoch_data.append(next(iterator)) - returned_data.append(epoch_data) - returned_data = self.evaluate(returned_data) - self.assertEqual(returned_data, [[([0], [2], [1.5]), ([1], [1], [2.]), - ([2], [0], [2.])], - [([0], [2], [1.5]), ([1], [1], [2.]), - ([2], [0], [2.])]]) - def test_class_weight_user_errors(self): with self.assertRaisesRegexp(ValueError, 'to be a dict with keys'): data_adapter.DataHandler( diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py index deb3bd27928..166553a324b 100644 --- a/tensorflow/python/keras/engine/network.py +++ b/tensorflow/python/keras/engine/network.py @@ -40,6 +40,7 @@ from tensorflow.python.framework import tensor_shape from tensorflow.python.keras import backend from tensorflow.python.keras.engine import base_layer from tensorflow.python.keras.engine import base_layer_utils +from tensorflow.python.keras.engine import compile_utils from tensorflow.python.keras.engine import input_layer as input_layer_module from tensorflow.python.keras.engine import node as node_module from tensorflow.python.keras.engine import training_utils @@ -50,6 +51,7 @@ from tensorflow.python.keras.utils import generic_utils from tensorflow.python.keras.utils import layer_utils from tensorflow.python.keras.utils import tf_utils from tensorflow.python.keras.utils.io_utils import ask_to_proceed_with_overwrite +from tensorflow.python.ops import math_ops from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import checkpoint_management @@ -200,7 +202,10 @@ class Network(base_layer.Layer): super(Network, self).__init__(name=name, **kwargs) + self.output_names = None + self.input_names = None self._is_compiled = False + self._saved_model_inputs_spec = None # This is True for Sequential networks and Functional networks. self._compute_output_and_mask_jointly = False @@ -326,6 +331,7 @@ class Network(base_layer.Layer): self._feed_inputs.append(layer.input) self._compute_tensor_usage_count() + self._set_save_spec(self._nested_inputs) def _set_output_names(self): """Assigns unique names to the Network's outputs. 
@@ -354,8 +360,8 @@ class Network(base_layer.Layer): self._autocast = kwargs.get('autocast', base_layer_utils.v2_dtype_behavior_enabled()) self._supports_ragged_inputs = None - self.outputs = [] - self.inputs = [] + self.outputs = None + self.inputs = None self.built = False self._build_input_shape = None @@ -573,24 +579,7 @@ class Network(base_layer.Layer): A list of `InputSpec` instances (one per input to the model) or a single instance if the model has only one input. """ - # If subclassed model, can't assume anything. - if not self._is_graph_network: - return None - - specs = [] - for layer in self._input_layers: - if layer.input_spec is None: - specs.append(None) - else: - if not isinstance(layer.input_spec, list): - raise TypeError('Layer ' + layer.name + - ' has an input_spec attribute that ' - 'is not a list. We expect a list. ' - 'Found input_spec = ' + str(layer.input_spec)) - specs += layer.input_spec - if len(specs) == 1: - return specs[0] - return specs + return @base_layer_utils.default def build(self, input_shape): @@ -648,6 +637,11 @@ class Network(base_layer.Layer): if isinstance(input_shape, list): x = [base_layer_utils.generate_placeholders_from_shape(shape) for shape in input_shape] + elif isinstance(input_shape, dict): + x = { + k: base_layer_utils.generate_placeholders_from_shape(shape) + for k, shape in input_shape.items() + } else: x = base_layer_utils.generate_placeholders_from_shape(input_shape) @@ -834,8 +828,7 @@ class Network(base_layer.Layer): tensor_dict = {} for x, y in zip(self.inputs, inputs): - x_id = str(id(x)) - tensor_dict[x_id] = [y] * self._tensor_usage_count[x_id] + # Set shape and dtype based on `keras.Input`s. if isinstance(x, ops.Tensor) and isinstance(y, ops.Tensor): try: y.set_shape(y.shape.merge_with(x.shape)) @@ -844,6 +837,11 @@ class Network(base_layer.Layer): 'Model was constructed with shape {} for input {}, but it was ' 're-called on a Tensor with incompatible shape {}.' .format(x, x.shape, y.shape)) + if isinstance(x, (ops.Tensor, composite_tensor.CompositeTensor)): + y = math_ops.cast(y, dtype=x.dtype) + + x_id = str(id(x)) + tensor_dict[x_id] = [y] * self._tensor_usage_count[x_id] depth_keys = list(self._nodes_by_depth.keys()) depth_keys.sort(reverse=True) @@ -1533,6 +1531,32 @@ class Network(base_layer.Layer): new_layers.append(add_metric_layer) self._insert_layers(new_layers, new_nodes) + @trackable.no_automatic_dependency_tracking + def _set_save_spec(self, inputs): + if self._saved_model_inputs_spec is not None: + return # Already set. 
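# Illustrative sketch (not part of the patch): the `_set_save_spec` method added in
# this hunk (continued just below) records one `TensorSpec` per flat input, and
# `_get_save_spec` can later relax the batch dimension. A hypothetical stand-alone
# stand-in for what such a spec amounts to; `make_input_spec` is not a real Keras API.
import tensorflow as tf

def make_input_spec(tensor, dynamic_batch=True, name=None):
  shape = tensor.shape.as_list()
  if dynamic_batch and shape:
    shape[0] = None  # relax the batch dimension, as `_get_save_spec` does by default
  return tf.TensorSpec(shape, tensor.dtype, name=name)

# Example: make_input_spec(tf.zeros([8, 32]), name='input_1')
# -> TensorSpec(shape=(None, 32), dtype=tf.float32, name='input_1')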
+ + input_names = self.input_names + if not input_names: + input_names = compile_utils.create_pseudo_input_names(inputs) + + flat_inputs = nest.flatten(inputs) + specs = [] + for name, tensor in zip(input_names, flat_inputs): + specs.append( + tf_utils.get_tensor_spec(tensor, dynamic_batch=False, name=name)) + specs = nest.pack_sequence_as(inputs, specs) + + self._saved_model_inputs_spec = specs + + def _get_save_spec(self, dynamic_batch=True): + if self._saved_model_inputs_spec is None: + return None + + return nest.map_structure( + lambda t: tf_utils.get_tensor_spec(t, dynamic_batch=dynamic_batch), + self._saved_model_inputs_spec) + @property def _trackable_saved_model_saver(self): return network_serialization.NetworkSavedModelSaver(self) diff --git a/tensorflow/python/keras/engine/sequential.py b/tensorflow/python/keras/engine/sequential.py index a86084f1a35..4ae06bc46e1 100644 --- a/tensorflow/python/keras/engine/sequential.py +++ b/tensorflow/python/keras/engine/sequential.py @@ -266,6 +266,10 @@ class Sequential(training.Model): self.built = True def call(self, inputs, training=None, mask=None): # pylint: disable=redefined-outer-name + if self._build_input_shape is None: + input_shapes = nest.map_structure(_get_shape_tuple, inputs) + self._build_input_shape = input_shapes + if self._is_graph_network: if not self.built: self._init_graph_network(self.inputs, self.outputs, name=self.name) @@ -364,7 +368,7 @@ class Sequential(training.Model): 'name': self.name, 'layers': copy.deepcopy(layer_configs) } - if self._build_input_shape: + if self._build_input_shape is not None: config['build_input_shape'] = self._build_input_shape return config @@ -383,7 +387,8 @@ class Sequential(training.Model): layer = layer_module.deserialize(layer_config, custom_objects=custom_objects) model.add(layer) - if not model.inputs and build_input_shape: + if (not model.inputs and build_input_shape and + isinstance(build_input_shape, (tuple, list))): model.build(build_input_shape) return model @@ -396,3 +401,12 @@ class Sequential(training.Model): @property def _trackable_saved_model_saver(self): return model_serialization.SequentialSavedModelSaver(self) + + +def _get_shape_tuple(t): + if hasattr(t, 'shape'): + shape = t.shape + if shape.rank is not None: + return tuple(shape.as_list()) + return None + return None diff --git a/tensorflow/python/keras/engine/sequential_test.py b/tensorflow/python/keras/engine/sequential_test.py index 65e58fd82cd..b5f24674b06 100644 --- a/tensorflow/python/keras/engine/sequential_test.py +++ b/tensorflow/python/keras/engine/sequential_test.py @@ -286,9 +286,16 @@ class TestSequential(keras_parameterized.TestCase): self.assertTrue(model.built) config = model.get_config() - self.assertIn('build_input_shape', config) - new_model = keras.models.Sequential.from_config(config) + new_model.compile( + loss='mse', + optimizer='rmsprop', + metrics=[keras.metrics.CategoricalAccuracy()], + run_eagerly=testing_utils.should_run_eagerly(), + experimental_run_tf_function=testing_utils.should_run_tf_function()) + x = np.random.random((batch_size, input_dim)) + y = np.random.random((batch_size, num_classes)) + new_model.train_on_batch(x, y) self.assertEqual(len(new_model.layers), 2) self.assertEqual(len(new_model.weights), 4) @@ -321,15 +328,12 @@ class TestSequential(keras_parameterized.TestCase): self.assertFalse(model.built) model(array_ops.zeros([1, 2])) self.assertTrue(model.built) - self.assertEqual(len(model.outputs), 0) model.compile( 'rmsprop', loss='mse', 
run_eagerly=testing_utils.should_run_eagerly(), experimental_run_tf_function=testing_utils.should_run_tf_function()) - self.assertEqual(len(model.outputs), 0) model.train_on_batch(np.zeros((1, 2)), np.zeros((1, 5))) - self.assertEqual(len(model.outputs), 1) @keras_parameterized.run_all_keras_modes def test_sequential_nesting(self): @@ -399,29 +403,21 @@ class TestSequential(keras_parameterized.TestCase): ValueError, 'should have a single output tensor'): keras.Sequential([MultiOutputLayer()])(np.zeros((10, 10))) - @keras_parameterized.run_all_keras_modes + @keras_parameterized.run_all_keras_modes(always_skip_v1=True) def test_layer_add_after_compile_deferred(self): model = keras.Sequential([keras.layers.Dense(3)]) - self.assertFalse(model.built) - self.assertFalse(model.inputs) - self.assertFalse(model.outputs) model.compile('adam', loss='mse') model.fit(np.random.random((1, 3)), np.random.random((1, 3))) - self.assertTrue(model.built) - self.assertTrue(model.inputs) - self.assertTrue(model.outputs) model.add(keras.layers.Dense(3)) - - self.assertTrue(model.built) - self.assertTrue(model.inputs) - self.assertTrue(model.outputs) + self.assertFalse(model.built) model.compile('adam', loss='mse') model.fit(np.random.random((1, 3)), np.random.random((1, 3))) + self.assertTrue(model.built) def test_sequential_layer_tracking(self): """Test that Sequential only tracks layers added in init or `.add`.""" @@ -442,21 +438,6 @@ class TestSequential(keras_parameterized.TestCase): model.pop() self.assertEqual(model._layers[-1], layer) - @testing_utils.enable_v2_dtype_behavior - def test_sequential_does_not_autocast(self): - - class AssertFloat64InputLayer(keras.layers.Layer): - - def __init__(self): - super(AssertFloat64InputLayer, self).__init__(autocast=False) - - def call(self, inputs): - assert inputs.dtype == 'float64', 'inputs are %s' % inputs.dtype - return array_ops.identity(inputs) - - model = keras.Sequential([AssertFloat64InputLayer(), keras.layers.Dense(4)]) - model(np.random.random((4, 4))) - class TestSequentialEagerIntegration(keras_parameterized.TestCase): @@ -500,27 +481,6 @@ class TestSequentialEagerIntegration(keras_parameterized.TestCase): y = np.random.random((2, 5)) model.fit(x, y, epochs=1) - @keras_parameterized.run_all_keras_modes - def test_sequential_model_fails_with_dict_inputs(self): - num_classes = 5 - model = testing_utils.get_small_sequential_mlp( - num_hidden=10, num_classes=num_classes) - model.compile( - 'rmsprop', - metrics=['acc'], - weighted_metrics=['mae'], - loss='categorical_crossentropy', - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) - - x = {'dense_input': np.random.random((10, 1))} - y = np.random.randint(num_classes, size=(10, 1)) - - with self.assertRaisesRegexp( - ValueError, 'Passing a dictionary input to a Sequential Model which ' - 'doesn\'t have FeatureLayer as the first layer is an error'): - model.fit(x, y, batch_size=5, epochs=1) - if __name__ == '__main__': test.main() diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index 298c09a0f12..7e86d9e2d8b 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -18,61 +18,73 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import collections - -import numpy as np - -from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.data.ops import iterator_ops 
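The deferred-build behavior exercised by `test_layer_add_after_compile_deferred` above can be summarized in a short standalone sketch (assuming TensorFlow with this change applied; data and layer sizes are arbitrary):

```python
import numpy as np
import tensorflow as tf

# A Sequential model with no `Input` layer stays deferred until it sees data.
model = tf.keras.Sequential([tf.keras.layers.Dense(3)])
model.compile('adam', loss='mse')
model.fit(np.random.random((1, 3)), np.random.random((1, 3)), verbose=0)

# Adding a layer after training resets the built state...
model.add(tf.keras.layers.Dense(3))
print(model.built)  # expected: False

# ...and the model builds again on the next compile/fit cycle.
model.compile('adam', loss='mse')
model.fit(np.random.random((1, 3)), np.random.random((1, 3)), verbose=0)
print(model.built)  # expected: True
```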
+from tensorflow.python.distribute import distribute_coordinator as dc from tensorflow.python.distribute import distribution_strategy_context as ds_context +from tensorflow.python.distribute import values as ds_values +from tensorflow.python.eager import backprop from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.eager import monitoring -from tensorflow.python.framework import composite_tensor_utils -from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor -from tensorflow.python.framework import tensor_shape -from tensorflow.python.framework import tensor_spec -from tensorflow.python.framework import tensor_util -from tensorflow.python.keras import backend as K -from tensorflow.python.keras import losses -from tensorflow.python.keras import metrics as metrics_module +from tensorflow.python.keras import callbacks as callbacks_module from tensorflow.python.keras import optimizers -from tensorflow.python.keras.distribute import distributed_training_utils +from tensorflow.python.keras.distribute import distributed_training_utils as dist_utils +from tensorflow.python.keras.engine import compile_utils +from tensorflow.python.keras.engine import data_adapter from tensorflow.python.keras.engine import network -from tensorflow.python.keras.engine import training_distributed from tensorflow.python.keras.engine import training_utils -from tensorflow.python.keras.engine import training_v2 -from tensorflow.python.keras.engine import training_v2_utils -from tensorflow.python.keras.mixed_precision.experimental import loss_scale_optimizer -from tensorflow.python.keras.optimizer_v2 import optimizer_v2 +from tensorflow.python.keras.mixed_precision.experimental import loss_scale_optimizer as lso from tensorflow.python.keras.saving.saved_model import model_serialization -from tensorflow.python.keras.utils import data_utils -from tensorflow.python.keras.utils import losses_utils -from tensorflow.python.keras.utils import tf_utils from tensorflow.python.keras.utils import version_utils from tensorflow.python.keras.utils.mode_keys import ModeKeys from tensorflow.python.ops import array_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops.losses import util as tf_losses_utils -from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.ops import sparse_ops +from tensorflow.python.ops.ragged import ragged_concat_ops +from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.training.tracking import base as trackable -from tensorflow.python.training.tracking import layer_utils as trackable_layer_utils from tensorflow.python.util import deprecation from tensorflow.python.util import nest -from tensorflow.python.util import tf_inspect +from tensorflow.python.util import tf_decorator from tensorflow.python.util.tf_export import keras_export -try: - from scipy.sparse import issparse # pylint: disable=g-import-not-at-top -except ImportError: - issparse = None _keras_api_gauge = monitoring.BoolGauge('/tensorflow/api/keras', 'keras api usage', 'method') +def enable_multi_worker(method): + """Decorator that handles running `method` with multi-worker strategy.""" + + def _method_wrapper(self, *args, **kwargs): + if not self._in_multi_worker_mode(): # pylint: disable=protected-access + return method(self, *args, **kwargs) + + return dc.run_distribute_coordinator( + lambda _: method(self, *args, 
**kwargs), + self.distribute_strategy, + mode=dc.CoordinatorMode.INDEPENDENT_WORKER) + + return tf_decorator.make_decorator( + target=method, decorator_func=_method_wrapper) + + +def disable_multi_worker(method): + """Decorator that disallows multi-worker use of `method`.""" + + def _method_wrapper(self, *args, **kwargs): + strategy = self.distribute_strategy + if (self._in_multi_worker_mode() or dist_utils.is_tpu_strategy(strategy) and # pylint: disable=protected-access + strategy.extended.num_hosts > 1): + raise ValueError('{} is not supported in multi-worker mode.'.format( + method.__name__)) + + return method(self, *args, **kwargs) + + return tf_decorator.make_decorator( + target=method, decorator_func=_method_wrapper) + + @keras_export('keras.Model', 'keras.models.Model') class Model(network.Network, version_utils.ModelVersionSelector): """`Model` groups layers into an object with training and inference features. @@ -148,7 +160,6 @@ class Model(network.Network, version_utils.ModelVersionSelector): def __init__(self, *args, **kwargs): super(Model, self).__init__(*args, **kwargs) _keras_api_gauge.get_cell('model').set(True) - # Model must be created under scope of DistStrat it will be trained with. if ds_context.has_strategy(): self._distribution_strategy = ds_context.get_strategy() @@ -156,6 +167,12 @@ class Model(network.Network, version_utils.ModelVersionSelector): self._distribution_strategy = None # Defaults to value of `tf.config.experimental_functions_run_eagerly`. self._run_eagerly = None + self.stop_training = False + # Initialize cache attrs. + self._reset_compile_cache() + + # Fault-tolerance handler. Set in `ModelCheckpoint`. + self._training_state = None def get_weights(self): """Retrieves the weights of the model. @@ -212,14 +229,13 @@ class Model(network.Network, version_utils.ModelVersionSelector): ValueError: If `skip_mismatch` is set to `True` when `by_name` is `False`. """ - if distributed_training_utils.is_tpu_strategy(self._distribution_strategy): + if dist_utils.is_tpu_strategy(self._distribution_strategy): if (self._distribution_strategy.extended.steps_per_run > 1 and (not network._is_hdf5_filepath(filepath))): # pylint: disable=protected-access raise ValueError('Load weights is not yet supported with TPUStrategy ' 'with steps_per_run greater than 1.') return super(Model, self).load_weights(filepath, by_name, skip_mismatch) - @trackable.no_automatic_dependency_tracking def compile(self, optimizer='rmsprop', loss=None, @@ -291,105 +307,52 @@ class Model(network.Network, version_utils.ModelVersionSelector): ValueError: In case of invalid arguments for `optimizer`, `loss`, `metrics` or `sample_weight_mode`. """ + _keras_api_gauge.get_cell('compile').set(True) self._validate_compile(optimizer, **kwargs) self._run_eagerly = kwargs.pop('run_eagerly', None) - self._set_optimizer(optimizer) - # We've disabled automatic dependency tracking for this method, but do want - # to add a checkpoint dependency on the optimizer if it's trackable. - if isinstance(self.optimizer, trackable.Trackable): - self._track_trackable( - self.optimizer, name='optimizer', overwrite=True) - self.loss = loss or {} - self.loss_weights = loss_weights - self.sample_weight_mode = sample_weight_mode - self._compile_metrics = metrics or [] - self._compile_weighted_metrics = weighted_metrics - # _training_endpoints contains a list of _TrainingEndpoint object, which has - # all the model output/target/loss and related metadata. 
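The `enable_multi_worker` wrapper above follows a plain decorate-and-dispatch pattern. A framework-free analogue is sketched below; all names (`in_multi_worker_mode`, `coordinator`, `FakeTrainer`) are illustrative only and are not Keras API:

```python
import functools


def run_with_coordinator_if_needed(method):
  """Runs `method` directly, or through a coordinator in multi-worker mode."""

  @functools.wraps(method)
  def wrapper(self, *args, **kwargs):
    if not self.in_multi_worker_mode:
      return method(self, *args, **kwargs)
    # Stand-in for dc.run_distribute_coordinator(...).
    return self.coordinator(lambda: method(self, *args, **kwargs))

  return wrapper


class FakeTrainer(object):
  in_multi_worker_mode = False

  def coordinator(self, fn):
    return fn()

  @run_with_coordinator_if_needed
  def fit(self, steps):
    return 'ran %d steps' % steps


print(FakeTrainer().fit(3))  # expected: ran 3 steps
```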
- self._training_endpoints = [] + self.optimizer = self._get_optimizer(optimizer) + self.compiled_loss = compile_utils.LossesContainer( + loss, loss_weights, output_names=self.output_names) + self.compiled_metrics = compile_utils.MetricsContainer( + metrics, weighted_metrics, output_names=self.output_names) - # Used to freeze the behavior of the Model once `compile` has been called. - self._compiled_trainable_state = self._get_trainable_state() - - # Set tf.distribute.Strategy specific parameters. - self._distributed_model_cache = {} - self._distributed_function_cache = {} - - # Clear any `_eager_losses` cached from a previous `Model.__call__`. - self._clear_losses() - - # Initialize model metric attributes. - self._init_metric_attributes() - if not self.built or not self.inputs or not self.outputs: - # Model is not compilable because it does not know its number of inputs - # and outputs, nor their shapes and names. We will compile after the first - # time the model gets called on training data. - return + # Initializes attrs that are reset each time `compile` is called. + self._reset_compile_cache() self._is_compiled = True - _keras_api_gauge.get_cell('compile').set(True) - # Prepare list of loss functions, same size of model outputs. - self.loss_functions = training_utils.prepare_loss_functions( - self.loss, self.output_names) + self.loss = loss or {} # Backwards compat. - target_tensors = self._process_target_tensor_for_compile(None) - for o, n, l, t in zip(self.outputs, self.output_names, - self.loss_functions, target_tensors): - endpoint = _TrainingEndpoint(o, n, l) - endpoint.create_training_target(t, run_eagerly=self.run_eagerly) - self._training_endpoints.append(endpoint) + def _get_optimizer(self, optimizer): + """Wraps `optimizer` in `LossScaleOptimizer` if necessary.""" - # Prepare list loss weights, same size of model outputs. - training_utils.prepare_loss_weights(self._training_endpoints, loss_weights) + def _get_single_optimizer(opt): + opt = optimizers.get(opt) + if (self._dtype_policy.loss_scale is not None and + not isinstance(opt, lso.LossScaleOptimizer)): + opt = lso.LossScaleOptimizer(opt, self._dtype_policy.loss_scale) + return opt - # Initialization for Eager mode execution. - if self.run_eagerly: - self._compile_eagerly(metrics, weighted_metrics, sample_weight_mode) - return - - with K.get_graph().as_default(): - # Save all metric attributes per output of the model. - self._cache_output_metric_attributes(metrics, weighted_metrics) - - # Set metric attributes on model. - self._set_metric_attributes() - - # Invoke metric functions (unweighted) for all the outputs. - self._handle_metrics( - self.outputs, - targets=self._targets, - skip_target_masks=self._prepare_skip_target_masks(), - masks=self._prepare_output_masks()) - - # Prepare sample weight modes. List with the same length as model outputs. - training_utils.prepare_sample_weight_modes( - self._training_endpoints, sample_weight_mode) - - # Creates the model loss and weighted metrics sub-graphs. - self._compile_weights_loss_and_weighted_metrics() - - # Functions for train, test and predict will - # be compiled lazily when required. - # This saves time when the user is not using all functions. - self.train_function = None - self.test_function = None - self.predict_function = None - - # Collected trainable weights, sorted in topological order. 
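A usage sketch of the slimmed-down `compile` flow above, assuming TensorFlow with this change applied: the model no longer needs inputs/outputs before compiling, and `metrics_names` is now derived from `Model.metrics` once the containers have been built by the first train step.

```python
import numpy as np
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
# Compiling works before the model has ever been called; losses and metrics
# are held by the compile_utils containers rather than per-output endpoints.
model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])

x = np.random.random((8, 4)).astype('float32')
y = np.random.random((8, 1)).astype('float32')
model.fit(x, y, epochs=1, verbose=0)
print(model.metrics_names)  # expected: ['loss', 'mae']
```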
- self._collected_trainable_weights = self.trainable_weights + return nest.map_structure(_get_single_optimizer, optimizer) @trackable.no_automatic_dependency_tracking - def _init_distributed_function_cache_if_not_compiled(self): - if not hasattr(self, '_distributed_function_cache'): - self._distributed_function_cache = {} + def _reset_compile_cache(self): + self.train_function = None + self.test_function = None + self.predict_function = None + + # Used to cache `trainable` attr of `Layer`s for `fit`. + self._compiled_trainable_state = self._get_trainable_state() @property def metrics(self): """Returns the model's metrics added using `compile`, `add_metric` APIs.""" metrics = [] if self._is_compiled: - metrics += self._compile_metric_functions + # TODO(omalleyt): Track `CompiledLoss` and `CompiledMetrics` objects + # so that attr names are not load-bearing. + metrics = self.compiled_loss.metrics + self.compiled_metrics.metrics + all_layers = self._gather_unique_layers() for l in all_layers: metrics.extend(l._metrics) # pylint: disable=protected-access @@ -401,26 +364,12 @@ class Model(network.Network, version_utils.ModelVersionSelector): # This property includes all output names including `loss` and per-output # losses for backward compatibility. - metrics_names = ['loss'] - if self._is_compiled: - # Add output loss metric names to the metric names list. - if len(self._training_endpoints) > 1: - metrics_names.extend([ - e.loss_name() - for e in self._training_endpoints - if not e.should_skip_target() - ]) - - # Add all metric names. - metrics_names += [m.name for m in self.metrics] - return metrics_names + return [m.name for m in self.metrics] @property def distribute_strategy(self): """The `tf.distribute.Strategy` this model was created under.""" - if self._distribution_strategy is None: - return ds_context._get_default_strategy() # pylint: disable=protected-access - return self._distribution_strategy + return self._distribution_strategy or ds_context.get_strategy() @property def run_eagerly(self): @@ -465,26 +414,93 @@ class Model(network.Network, version_utils.ModelVersionSelector): def run_eagerly(self, value): self._run_eagerly = value - def _select_training_loop(self, inputs): - """Select training loop for fit/eval/predict based on the inputs.""" - # TODO(kaftan) or TODO(scottzhu): This check should eventually be nicely - # integrated into the data adapters in the v2 loop. We can't do this yet - # because we currently have to fall back for unhandled data types. - if isinstance(inputs, (iterator_ops.Iterator, - iterator_ops.OwnedIterator)): - raise ValueError('For performance reasons Keras `fit`, `evaluate` and' - '`predict` accept tf.data `Datasets` as input but not ' - 'iterators that have been manually generated from ' - 'Datasets by users. Please directly pass in the ' - 'original `Dataset` object instead of passing in ' - '`iter(dataset)`.') + def _train_step(self, data): + """The logic for one training step. - if self._in_multi_worker_mode(): - return training_distributed.DistributionMultiWorkerTrainingLoop( - training_v2.Loop()) - else: - return training_v2.Loop() + This method can be overridden to support custom training logic. + This method is called by `Model._make_train_function`. + This method should contain the mathematical logic for one step of training. + This typically includes the forward pass, loss calculation, backpropagation, + and metric updates. + + Configuration details for *how* this logic is run (e.g.
`tf.function` and + `tf.distribute.Strategy` settings), should be left to + `Model._make_train_function`, which can also be overridden. + + Arguments: + data: A nested structure of `Tensor`s. + + Returns: + A `dict` containing values that will be passed to + `tf.keras.callbacks.CallbackList.on_train_batch_end`. Typically, the + values of the `Model`'s metrics are returned. Example: + `{'loss': 0.2, 'accuracy': 0.7}`. + + """ + # These are the only transformations `Model.fit` applies to user-input + # data when a `tf.data.Dataset` is provided. These utilities will be exposed + # publicly. + data = data_adapter.expand_1d(data) + x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data) + + with backprop.GradientTape() as tape: + y_pred = self(x, training=True) + loss = self.compiled_loss( + y, y_pred, sample_weight, regularization_losses=self.losses) + if isinstance(self.optimizer, lso.LossScaleOptimizer): + loss = self.optimizer.get_scaled_loss(loss) + + trainable_variables = self.trainable_variables + gradients = tape.gradient(loss, trainable_variables) + if isinstance(self.optimizer, lso.LossScaleOptimizer): + gradients = self.optimizer.get_unscaled_gradients(gradients) + gradients = self.optimizer._clip_gradients(gradients) # pylint: disable=protected-access + if trainable_variables: + self.optimizer.apply_gradients(zip(gradients, trainable_variables)) + + self.compiled_metrics.update_state(y, y_pred, sample_weight) + return {m.name: m.result() for m in self.metrics} + + def _make_train_function(self): + """Creates a function that executes one step of training. + + This method can be overridden to support custom training logic. + This method is called by `Model.fit` and `Model.train_on_batch`. + + Typically, this method directly controls `tf.function` and + `tf.distribute.Strategy` settings, and delegates the actual training + logic to `Model._train_step`. + + This function is cached the first time `Model.fit` or + `Model.train_on_batch` is called. The cache is cleared whenever + `Model.compile` is called. + + Returns: + Function. The function created by this method should accept a + `tf.data.Iterator`, and return a `dict` containing values that will + be passed to `tf.keras.Callbacks.on_train_batch_end`, such as + `{'loss': 0.2, 'accuracy': 0.7}`. + """ + if self.train_function is not None: + return self.train_function + + def train_function(iterator): + data = next(iterator) + outputs = self.distribute_strategy.experimental_run_v2( + self._train_step, args=(data,)) + outputs = reduce_per_replica( + outputs, self.distribute_strategy, reduction='first') + return outputs + + if not self.run_eagerly: + train_function = def_function.function( + train_function, experimental_relax_shapes=True) + + self.train_function = train_function + return self.train_function + + @enable_multi_worker def fit(self, x=None, y=None, @@ -500,6 +516,7 @@ class Model(network.Network, version_utils.ModelVersionSelector): initial_epoch=0, steps_per_epoch=None, validation_steps=None, + validation_batch_size=None, validation_freq=1, max_queue_size=10, workers=1, @@ -532,9 +549,8 @@ class Model(network.Network, version_utils.ModelVersionSelector): Number of samples per gradient update. If unspecified, `batch_size` will default to 32. Do not specify the `batch_size` if your data is in the - form of symbolic tensors, datasets, - generators, or `keras.utils.Sequence` instances (since they generate - batches). + form of datasets, generators, or `keras.utils.Sequence` instances + (since they generate batches). 
epochs: Integer. Number of epochs to train the model. An epoch is an iteration over the entire `x` and `y` data provided. @@ -624,6 +640,12 @@ class Model(network.Network, version_utils.ModelVersionSelector): the dataset will be consumed, the evaluation will start from the beginning of the dataset at each epoch. This ensures that the same validation samples are used every time. + validation_batch_size: Integer or `None`. + Number of samples per validation batch. + If unspecified, will default to `batch_size`. + Do not specify the `validation_batch_size` if your data is in the + form of datasets, generators, or `keras.utils.Sequence` instances + (since they generate batches). validation_freq: Only relevant if validation data is provided. Integer or `collections_abc.Container` instance (e.g. list, tuple, etc.). If an integer, specifies how many training epochs to run before a @@ -685,38 +707,160 @@ class Model(network.Network, version_utils.ModelVersionSelector): _keras_api_gauge.get_cell('fit').set(True) # Legacy graph support is contained in `training_v1.Model`. version_utils.disallow_legacy_graph('Model', 'fit') - # Legacy support - if 'nb_epoch' in kwargs: - logging.warning( - 'The `nb_epoch` argument in `fit` has been renamed `epochs`.') - epochs = kwargs.pop('nb_epoch') - if kwargs: - raise TypeError('Unrecognized keyword arguments: ' + str(kwargs)) self._assert_compile_was_called() self._check_call_args('fit') - func = self._select_training_loop(x) - return func.fit( - self, - x=x, - y=y, - batch_size=batch_size, - epochs=epochs, - verbose=verbose, - callbacks=callbacks, - validation_split=validation_split, - validation_data=validation_data, - shuffle=shuffle, - class_weight=class_weight, - sample_weight=sample_weight, - initial_epoch=initial_epoch, - steps_per_epoch=steps_per_epoch, - validation_steps=validation_steps, - validation_freq=validation_freq, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing) + if validation_split: + # Create the validation data using the training data. Only supported for + # `Tensor` and `NumPy` input. + (x, y, sample_weight), validation_data = ( + data_adapter.train_validation_split((x, y, sample_weight), + validation_split=validation_split, + shuffle=False)) + with self.distribute_strategy.scope(), \ + training_utils.RespectCompiledTrainableState(self): + # Creates a `tf.data.Dataset` and handles batch and epoch iteration. + data_handler = data_adapter.DataHandler( + x=x, + y=y, + sample_weight=sample_weight, + batch_size=batch_size, + steps_per_epoch=steps_per_epoch, + initial_epoch=initial_epoch, + epochs=epochs, + shuffle=shuffle, + class_weight=class_weight, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + model=self) + + # Container that configures and calls `tf.keras.Callback`s. + callbacks = callbacks_module.CallbackList( + callbacks, + add_history=True, + add_progbar=True, + model=self, + verbose=verbose, + epochs=epochs, + steps=data_handler._steps) # pylint: disable=protected-access + + self.stop_training = False + train_function = self._make_train_function() + callbacks.on_train_begin() + # Handle fault-tolerance for multi-worker. + # TODO(omalleyt): Fix the ordering issues that mean this has to + # happen after `callbacks.on_train_begin`. 
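Usage sketch for the `validation_batch_size` argument introduced above (assuming TensorFlow with this change applied; the model and shapes are illustrative): validation batches can now be sized independently of training batches.

```python
import numpy as np
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
model.compile('sgd', loss='mse')

x = np.random.random((100, 4)).astype('float32')
y = np.random.random((100, 1)).astype('float32')

# Train on batches of 16, but evaluate the held-out 20% on batches of 50.
model.fit(x, y,
          batch_size=16,
          validation_split=0.2,
          validation_batch_size=50,
          epochs=2,
          verbose=0)
```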
+ data_handler._initial_epoch = ( # pylint: disable=protected-access + self._maybe_load_initial_epoch_from_ckpt(initial_epoch)) + for epoch, iterator in data_handler.enumerate_epochs(): + self.reset_metrics() + callbacks.on_epoch_begin(epoch) + with data_handler.catch_stop_iteration(): + for step in data_handler.steps(): + callbacks.on_train_batch_begin(step) + logs = train_function(iterator) + callbacks.on_train_batch_end(step, logs) + epoch_logs = {m.name: m.result() for m in self.metrics} + + # Run validation. + if validation_data and self._should_eval(epoch, validation_freq): + val_x, val_y, val_sample_weight = ( + data_adapter.unpack_x_y_sample_weight(validation_data)) + val_logs = self.evaluate( + x=val_x, + y=val_y, + sample_weight=val_sample_weight, + batch_size=validation_batch_size or batch_size, + steps=validation_steps, + callbacks=callbacks, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + return_dict=True) + val_logs = {'val_' + name: val for name, val in val_logs.items()} + epoch_logs.update(val_logs) + + callbacks.on_epoch_end(epoch, epoch_logs) + if self.stop_training: + break + + callbacks.on_train_end() + return self.history + + def _test_step(self, data): + """The logic for one evaluation step. + + This method can be overridden to support custom evaluation logic. + This method is called by `Model._make_test_function`. + + This function should contain the mathematical logic for one step of + evaluation. + This typically includes the forward pass, loss calculation, and metrics + updates. + + Configuration details for *how* this logic is run (e.g. `tf.function` and + `tf.distribute.Strategy` settings), should be left to + `Model._make_test_function`, which can also be overridden. + + Arguments: + data: A nested structure of `Tensor`s. + + Returns: + A `dict` containing values that will be passed to + `tf.keras.callbacks.CallbackList.on_test_batch_end`. Typically, the + values of the `Model`'s metrics are returned. + """ + data = data_adapter.expand_1d(data) + x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data) + + y_pred = self(x, training=False) + # Updates stateful loss metrics. + self.compiled_loss( + y, y_pred, sample_weight, regularization_losses=self.losses) + + self.compiled_metrics.update_state(y, y_pred, sample_weight) + return {m.name: m.result() for m in self.metrics} + + def _make_test_function(self): + """Creates a function that executes one step of evaluation. + + This method can be overridden to support custom evaluation logic. + This method is called by `Model.evaluate` and `Model.test_on_batch`. + + Typically, this method directly controls `tf.function` and + `tf.distribute.Strategy` settings, and delegates the actual evaluation + logic to `Model._test_step`. + + This function is cached the first time `Model.evaluate` or + `Model.test_on_batch` is called. The cache is cleared whenever + `Model.compile` is called. + + Returns: + Function. The function created by this method should accept a + `tf.data.Iterator`, and return a `dict` containing values that will + be passed to `tf.keras.Callbacks.on_test_batch_end`.
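Because `_make_train_function` above dispatches to `self._train_step`, the per-batch logic can be customized by overriding that hook in a subclass. A minimal sketch mirroring the default implementation (minus loss scaling and gradient clipping); the layer sizes and data are illustrative, and the underscore-prefixed name is specific to this revision:

```python
import numpy as np
import tensorflow as tf


class CustomStepModel(tf.keras.Model):

  def __init__(self):
    super(CustomStepModel, self).__init__()
    self.dense = tf.keras.layers.Dense(1)

  def call(self, inputs):
    return self.dense(inputs)

  def _train_step(self, data):
    # Forward pass, loss, gradients, metric update: the same shape as the
    # default `_train_step`, without loss scaling or gradient clipping.
    x, y = data
    with tf.GradientTape() as tape:
      y_pred = self(x, training=True)
      loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses)
    grads = tape.gradient(loss, self.trainable_variables)
    self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
    self.compiled_metrics.update_state(y, y_pred)
    return {m.name: m.result() for m in self.metrics}


model = CustomStepModel()
model.compile('sgd', loss='mse', metrics=['mae'])
x = np.random.random((32, 4)).astype('float32')
y = np.random.random((32, 1)).astype('float32')
model.fit(x, y, epochs=1, verbose=0)
```

The same pattern applies to `_test_step` and `_predict_step`, which `_make_test_function` and `_make_predict_function` dispatch to in the same way.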
+ """ + if self.test_function is not None: + return self.test_function + + def test_function(iterator): + data = next(iterator) + outputs = self.distribute_strategy.experimental_run_v2( + self._test_step, args=(data,)) + outputs = reduce_per_replica( + outputs, self.distribute_strategy, reduction='first') + return outputs + + if not self.run_eagerly: + test_function = def_function.function( + test_function, experimental_relax_shapes=True) + + self.test_function = test_function + return self.test_function + + @enable_multi_worker def evaluate(self, x=None, y=None, @@ -727,76 +871,67 @@ class Model(network.Network, version_utils.ModelVersionSelector): callbacks=None, max_queue_size=10, workers=1, - use_multiprocessing=False): + use_multiprocessing=False, + return_dict=False): """Returns the loss value & metrics values for the model in test mode. Computation is done in batches. Arguments: - x: Input data. It could be: - - A Numpy array (or array-like), or a list of arrays - (in case the model has multiple inputs). - - A TensorFlow tensor, or a list of tensors - (in case the model has multiple inputs). - - A dict mapping input names to the corresponding array/tensors, - if the model has named inputs. - - A `tf.data` dataset. - - A generator or `keras.utils.Sequence` instance. - A more detailed description of unpacking behavior for iterator types - (Dataset, generator, Sequence) is given in the `Unpacking behavior - for iterator-like inputs` section of `Model.fit`. - y: Target data. Like the input data `x`, - it could be either Numpy array(s) or TensorFlow tensor(s). - It should be consistent with `x` (you cannot have Numpy inputs and - tensor targets, or inversely). - If `x` is a dataset, generator or - `keras.utils.Sequence` instance, `y` should not be specified (since - targets will be obtained from the iterator/dataset). - batch_size: Integer or `None`. - Number of samples per gradient update. - If unspecified, `batch_size` will default to 32. - Do not specify the `batch_size` if your data is in the - form of symbolic tensors, dataset, - generators, or `keras.utils.Sequence` instances (since they generate - batches). - verbose: 0 or 1. Verbosity mode. - 0 = silent, 1 = progress bar. - sample_weight: Optional Numpy array of weights for - the test samples, used for weighting the loss function. - You can either pass a flat (1D) - Numpy array with the same length as the input samples - (1:1 mapping between weights and samples), - or in the case of temporal data, - you can pass a 2D array with shape - `(samples, sequence_length)`, - to apply a different weight to every timestep of every sample. - In this case you should make sure to specify - `sample_weight_mode="temporal"` in `compile()`. This argument is not - supported when `x` is a dataset, instead pass - sample weights as the third element of `x`. - steps: Integer or `None`. - Total number of steps (batches of samples) - before declaring the evaluation round finished. - Ignored with the default value of `None`. - If x is a `tf.data` dataset and `steps` is - None, 'evaluate' will run until the dataset is exhausted. - This argument is not supported with array inputs. - callbacks: List of `keras.callbacks.Callback` instances. - List of callbacks to apply during evaluation. - See [callbacks](/api_docs/python/tf/keras/callbacks). + x: Input data. It could be: - A Numpy array (or array-like), or a list + of arrays (in case the model has multiple inputs). - A TensorFlow + tensor, or a list of tensors (in case the model has multiple inputs). 
+ - A dict mapping input names to the corresponding array/tensors, if + the model has named inputs. - A `tf.data` dataset. - A generator or + `keras.utils.Sequence` instance. A more detailed description of + unpacking behavior for iterator types (Dataset, generator, Sequence) + is given in the `Unpacking behavior for iterator-like inputs` section + of `Model.fit`. + y: Target data. Like the input data `x`, it could be either Numpy + array(s) or TensorFlow tensor(s). It should be consistent with `x` + (you cannot have Numpy inputs and tensor targets, or inversely). If + `x` is a dataset, generator or `keras.utils.Sequence` instance, `y` + should not be specified (since targets will be obtained from the + iterator/dataset). + batch_size: Integer or `None`. Number of samples per gradient update. If + unspecified, `batch_size` will default to 32. Do not specify the + `batch_size` if your data is in the form of a dataset, generators, + or `keras.utils.Sequence` instances (since they generate batches). + verbose: 0 or 1. Verbosity mode. 0 = silent, 1 = progress bar. + sample_weight: Optional Numpy array of weights for the test samples, + used for weighting the loss function. You can either pass a flat (1D) + Numpy array with the same length as the input samples + (1:1 mapping between weights and samples), or in the case of + temporal data, you can pass a 2D array with shape `(samples, + sequence_length)`, to apply a different weight to every timestep + of every sample. In this case you should make sure to specify + `sample_weight_mode="temporal"` in `compile()`. This argument is + not supported when `x` is a dataset, instead pass sample weights + as the third element of `x`. + steps: Integer or `None`. Total number of steps (batches of samples) + before declaring the evaluation round finished. Ignored with the + default value of `None`. If x is a `tf.data` dataset and `steps` is + None, 'evaluate' will run until the dataset is exhausted. This + argument is not supported with array inputs. + callbacks: List of `keras.callbacks.Callback` instances. List of + callbacks to apply during evaluation. See + [callbacks](/api_docs/python/tf/keras/callbacks). max_queue_size: Integer. Used for generator or `keras.utils.Sequence` - input only. Maximum size for the generator queue. - If unspecified, `max_queue_size` will default to 10. + input only. Maximum size for the generator queue. If unspecified, + `max_queue_size` will default to 10. workers: Integer. Used for generator or `keras.utils.Sequence` input - only. Maximum number of processes to spin up when using - process-based threading. If unspecified, `workers` will default - to 1. If 0, will execute the generator on the main thread. + only. Maximum number of processes to spin up when using process-based + threading. If unspecified, `workers` will default to 1. If 0, will + execute the generator on the main thread. use_multiprocessing: Boolean. Used for generator or - `keras.utils.Sequence` input only. If `True`, use process-based - threading. If unspecified, `use_multiprocessing` will default to - `False`. Note that because this implementation relies on - multiprocessing, you should not pass non-picklable arguments to - the generator as they can't be passed easily to children processes. + `keras.utils.Sequence` input only. If `True`, use process-based + threading. If unspecified, `use_multiprocessing` will default to + `False`. 
Note that because this implementation relies on + multiprocessing, you should not pass non-picklable arguments to the + generator as they can't be passed easily to children processes. + return_dict: If `True`, loss and metric results are returned as a dict, + with each key being the name of the metric. If `False`, they are + returned as a list. See the discussion of `Unpacking behavior for iterator-like inputs` for `Model.fit`. @@ -815,20 +950,112 @@ class Model(network.Network, version_utils.ModelVersionSelector): self._assert_compile_was_called() self._check_call_args('evaluate') - func = self._select_training_loop(x) - return func.evaluate( - self, - x=x, - y=y, - batch_size=batch_size, - verbose=verbose, - sample_weight=sample_weight, - steps=steps, - callbacks=callbacks, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing) + with self.distribute_strategy.scope(): + # Creates a `tf.data.Dataset` and handles batch and epoch iteration. + data_handler = data_adapter.DataHandler( + x=x, + y=y, + sample_weight=sample_weight, + batch_size=batch_size, + steps_per_epoch=steps, + initial_epoch=0, + epochs=1, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + model=self) + # Container that configures and calls `tf.keras.Callback`s. + if not isinstance(callbacks, callbacks_module.CallbackList): + callbacks = callbacks_module.CallbackList( + callbacks, + add_history=True, + add_progbar=True, + model=self, + verbose=verbose, + epochs=1, + steps=data_handler._steps) # pylint: disable=protected-access + + test_function = self._make_test_function() + callbacks.on_test_begin() + for _, iterator in data_handler.enumerate_epochs(): # Single epoch. + self.reset_metrics() + with data_handler.catch_stop_iteration(): + for step in data_handler.steps(): + callbacks.on_test_batch_begin(step) + logs = test_function(iterator) + callbacks.on_test_batch_end(step, logs) + callbacks.on_test_end() + + if return_dict: + return {m.name: m.result().numpy() for m in self.metrics} + else: + results = [m.result().numpy() for m in self.metrics] + if len(results) == 1: + return results[0] + return results + + def _predict_step(self, data): + """The logic for one inference step. + + This method can be overridden to support custom inference logic. + This method is called by `Model._make_predict_function`. + + This method should contain the mathematical logic for one step of inference. + This typically includes the forward pass. + + Configuration details for *how* this logic is run (e.g. `tf.function` and + `tf.distribute.Strategy` settings), should be left to + `Model._make_predict_function`, which can also be overridden. + + Arguments: + data: A nested structure of `Tensor`s. + + Returns: + The result of one inference step, typically the output of calling the + `Model` on data. + """ + data = data_adapter.expand_1d(data) + x, _, _ = data_adapter.unpack_x_y_sample_weight(data) + return self(x, training=False) + + def _make_predict_function(self): + """Creates a function that executes one step of inference. + + This method can be overridden to support custom inference logic. + This method is called by `Model.predict` and `Model.predict_on_batch`. + + Typically, this method directly controls `tf.function` and + `tf.distribute.Strategy` settings, and delegates the actual inference + logic to `Model._predict_step`. + + This function is cached the first time `Model.predict` or + `Model.predict_on_batch` is called.
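Usage sketch for the `return_dict` flag added to `evaluate` above (assuming TensorFlow with this change applied; model and data are illustrative):

```python
import numpy as np
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
model.compile('sgd', loss='mse', metrics=['mae'])

x = np.random.random((16, 4)).astype('float32')
y = np.random.random((16, 1)).astype('float32')
model.fit(x, y, epochs=1, verbose=0)

print(model.evaluate(x, y, verbose=0))
# expected form: [loss_value, mae_value]
print(model.evaluate(x, y, verbose=0, return_dict=True))
# expected form: {'loss': ..., 'mae': ...}
```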
The cache is cleared whenever + `Model.compile` is called. + + Returns: + Function. The function created by this method should accept a + `tf.data.Iterator`, and return the outputs of the `Model`. + """ + if self.predict_function is not None: + return self.predict_function + + def predict_function(iterator): + data = next(iterator) + outputs = self.distribute_strategy.experimental_run_v2( + self._predict_step, args=(data,)) + outputs = reduce_per_replica( + outputs, self.distribute_strategy, reduction='concat') + return outputs + + if not self.run_eagerly: + predict_function = def_function.function( + predict_function, experimental_relax_shapes=True) + + self.predict_function = predict_function + return self.predict_function + + @disable_multi_worker def predict(self, x, batch_size=None, @@ -862,9 +1089,8 @@ class Model(network.Network, version_utils.ModelVersionSelector): Number of samples per batch. If unspecified, `batch_size` will default to 32. Do not specify the `batch_size` if your data is in the - form of symbolic tensors, dataset, - generators, or `keras.utils.Sequence` instances (since they generate - batches). + form of dataset, generators, or `keras.utils.Sequence` instances + (since they generate batches). verbose: Verbosity mode, 0 or 1. steps: Total number of steps (batches of samples) before declaring the prediction round finished. @@ -906,22 +1132,53 @@ class Model(network.Network, version_utils.ModelVersionSelector): version_utils.disallow_legacy_graph('Model', 'predict') self._check_call_args('predict') - func = self._select_training_loop(x) - return func.predict( - self, - x=x, - batch_size=batch_size, - verbose=verbose, - steps=steps, - callbacks=callbacks, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing) + outputs = None + with self.distribute_strategy.scope(): + # Creates a `tf.data.Dataset` and handles batch and epoch iteration. + data_handler = data_adapter.DataHandler( + x=x, + batch_size=batch_size, + steps_per_epoch=steps, + initial_epoch=0, + epochs=1, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + model=self) + + # Container that configures and calls `tf.keras.Callback`s. + callbacks = callbacks_module.CallbackList( + callbacks, + add_history=True, + add_progbar=True, + model=self, + verbose=verbose, + epochs=1, + steps=data_handler._steps) # pylint: disable=protected-access + + predict_function = self._make_predict_function() + callbacks.on_predict_begin() + for _, iterator in data_handler.enumerate_epochs(): # Single epoch. + with data_handler.catch_stop_iteration(): + for step in data_handler.steps(): + callbacks.on_predict_batch_begin(step) + batch_outputs = predict_function(iterator) + if outputs is None: + outputs = nest.map_structure(lambda batch_output: [batch_output], + batch_outputs) + else: + nest.map_structure_up_to( + batch_outputs, + lambda output, batch_output: output.append(batch_output), + outputs, batch_outputs) + callbacks.on_predict_batch_end(step, {'outputs': batch_outputs}) + callbacks.on_predict_end() + all_outputs = nest.map_structure_up_to(batch_outputs, concat, outputs) + return to_numpy(all_outputs) def reset_metrics(self): """Resets the state of metrics.""" - metrics = self._get_training_eval_metrics() - for m in metrics: + for m in self.metrics: m.reset_states() def train_on_batch(self, @@ -940,19 +1197,15 @@ class Model(network.Network, version_utils.ModelVersionSelector): (in case the model has multiple inputs). 
- A dict mapping input names to the corresponding array/tensors, if the model has named inputs. - - A `tf.data` dataset. y: Target data. Like the input data `x`, it could be either Numpy array(s) or TensorFlow tensor(s). It should be consistent with `x` - (you cannot have Numpy inputs and tensor targets, or inversely). If - `x` is a dataset, `y` should not be specified - (since targets will be obtained from the iterator). + (you cannot have Numpy inputs and tensor targets, or inversely). sample_weight: Optional array of the same length as x, containing weights to apply to the model's loss for each sample. In the case of temporal data, you can pass a 2D array with shape (samples, sequence_length), to apply a different weight to every timestep of every sample. In this case you should make sure to specify - sample_weight_mode="temporal" in compile(). This argument is not - supported when `x` is a dataset. + sample_weight_mode="temporal" in compile(). class_weight: Optional dictionary mapping class indices (integers) to a weight (float) to apply to the model's loss for the samples from this class during training. This can be useful to tell the model to "pay @@ -973,46 +1226,38 @@ class Model(network.Network, version_utils.ModelVersionSelector): """ self._assert_compile_was_called() self._check_call_args('train_on_batch') - outputs = training_v2_utils.train_on_batch( - self, - x, - y=y, - sample_weight=sample_weight, - class_weight=class_weight, - reset_metrics=reset_metrics, - standalone=True) - outputs = ( - outputs['total_loss'] + outputs['output_losses'] + outputs['metrics']) - outputs = [training_v2_utils._non_none_constant_value(v) for v in outputs] # pylint: disable=protected-access - if len(outputs) == 1: - outputs = outputs[0] - return outputs + with self.distribute_strategy.scope(), \ + training_utils.RespectCompiledTrainableState(self): + iterator = data_adapter.single_batch_iterator(self.distribute_strategy, x, + y, sample_weight, + class_weight) + train_function = self._make_train_function() + train_function(iterator) + metrics = [m.result().numpy() for m in self.metrics] + if reset_metrics: + self.reset_metrics() + if len(metrics) == 1: + return metrics[0] + return metrics def test_on_batch(self, x, y=None, sample_weight=None, reset_metrics=True): """Test the model on a single batch of samples. Arguments: - x: Input data. It could be: - - A Numpy array (or array-like), or a list of arrays - (in case the model has multiple inputs). - - A TensorFlow tensor, or a list of tensors - (in case the model has multiple inputs). - - A dict mapping input names to the corresponding array/tensors, - if the model has named inputs. - - A `tf.data` dataset. - y: Target data. Like the input data `x`, - it could be either Numpy array(s) or TensorFlow tensor(s). - It should be consistent with `x` (you cannot have Numpy inputs and - tensor targets, or inversely). If `x` is a dataset `y` should - not be specified (since targets will be obtained from the iterator). + x: Input data. It could be: - A Numpy array (or array-like), or a list + of arrays (in case the model has multiple inputs). - A TensorFlow + tensor, or a list of tensors (in case the model has multiple inputs). + - A dict mapping input names to the corresponding array/tensors, if + the model has named inputs. + y: Target data. Like the input data `x`, it could be either Numpy + array(s) or TensorFlow tensor(s). It should be consistent with `x` + (you cannot have Numpy inputs and tensor targets, or inversely). 
sample_weight: Optional array of the same length as x, containing - weights to apply to the model's loss for each sample. - In the case of temporal data, you can pass a 2D array - with shape (samples, sequence_length), - to apply a different weight to every timestep of every sample. - In this case you should make sure to specify - sample_weight_mode="temporal" in compile(). This argument is not - supported when `x` is a dataset. + weights to apply to the model's loss for each sample. In the case of + temporal data, you can pass a 2D array with shape (samples, + sequence_length), to apply a different weight to every timestep of + every sample. In this case you should make sure to specify + sample_weight_mode="temporal" in compile(). reset_metrics: If `True`, the metrics returned will be only for this batch. If `False`, the metrics will be statefully accumulated across batches. @@ -1028,30 +1273,25 @@ class Model(network.Network, version_utils.ModelVersionSelector): """ self._assert_compile_was_called() self._check_call_args('test_on_batch') - outputs = training_v2_utils.test_on_batch( - self, - x, - y=y, - sample_weight=sample_weight, - reset_metrics=reset_metrics, - standalone=True) - outputs = ( - outputs['total_loss'] + outputs['output_losses'] + outputs['metrics']) - outputs = [training_v2_utils._non_none_constant_value(v) for v in outputs] # pylint: disable=protected-access - if len(outputs) == 1: - outputs = outputs[0] - return outputs + with self.distribute_strategy.scope(): + iterator = data_adapter.single_batch_iterator(self.distribute_strategy, x, + y, sample_weight) + test_function = self._make_test_function() + test_function(iterator) + metrics = [m.result().numpy() for m in self.metrics] + if reset_metrics: + self.reset_metrics() + if len(metrics) == 1: + return metrics[0] + return metrics def predict_on_batch(self, x): """Returns predictions for a single batch of samples. Arguments: - x: Input data. It could be: - - A Numpy array (or array-like), or a list of arrays - (in case the model has multiple inputs). - - A TensorFlow tensor, or a list of tensors - (in case the model has multiple inputs). - - A `tf.data` dataset. + x: Input data. It could be: - A Numpy array (or array-like), or a list + of arrays (in case the model has multiple inputs). - A TensorFlow + tensor, or a list of tensors (in case the model has multiple inputs). Returns: Numpy array(s) of predictions. @@ -1061,7 +1301,11 @@ class Model(network.Network, version_utils.ModelVersionSelector): expectations of the model. """ self._check_call_args('predict_on_batch') - return training_v2_utils.predict_on_batch(self, x, standalone=True) + with self.distribute_strategy.scope(): + iterator = data_adapter.single_batch_iterator(self.distribute_strategy, x) + predict_function = self._make_predict_function() + outputs = predict_function(iterator) + return to_numpy(outputs) @deprecation.deprecated( None, 'Please use Model.fit, which supports generators.') @@ -1176,54 +1420,11 @@ class Model(network.Network, version_utils.ModelVersionSelector): 'and the first argument in `call` as positional arguments, ' 'found: ' + str(extra_args) + '.') - def _set_optimizer(self, optimizer): - """Sets self.optimizer. - - Sets self.optimizer to `optimizer`, potentially wrapping it with a - LossScaleOptimizer. - - Args: - optimizer: The optimizer(s) to assign to self.optimizer. 
- """ - if isinstance(optimizer, (list, tuple)): - self.optimizer = [optimizers.get(opt) for opt in optimizer] - else: - self.optimizer = optimizers.get(optimizer) - - if (self._dtype_policy.loss_scale is not None and - not isinstance(self.optimizer, - loss_scale_optimizer.LossScaleOptimizer)): - if isinstance(self.optimizer, list): - raise ValueError('When a dtype policy with a loss scale is used, you ' - 'can only pass a single optimizer. Using policy %s ' - 'and got optimizers: %s' % - self._dtype_policy, self.optimizer) - if not isinstance(self.optimizer, optimizer_v2.OptimizerV2): - raise ValueError('"optimizer" must be an instance of ' - 'tf.keras.optimizers.Optimizer when a dype policy ' - 'with a loss scale used, but got: %s. Using policy: ' - '%s' % - (self.optimizer, self._dtype_policy)) - self.optimizer = loss_scale_optimizer.LossScaleOptimizer( - self.optimizer, self._dtype_policy.loss_scale) - if (isinstance(self.optimizer, loss_scale_optimizer.LossScaleOptimizer) and - self._dtype_policy.loss_scale and - self.optimizer.loss_scale != self._dtype_policy.loss_scale): - logging.warning('LossScale of LossScaleOptimizer passed to compile (%s) ' - 'is not the same as the dtype policy\'s loss scale (%s). ' - 'Because the dtype policy has a loss scale, you should ' - 'pass an optimizer that is not wrapped with a ' - 'LossScaleOptimizer,' - % (self.optimizer.loss_scale, - self._dtype_policy.loss_scale)) - def _validate_compile(self, optimizer, **kwargs): """Performs validation checks for the default `compile`.""" - is_any_keras_optimizer_v1 = any( - (isinstance(opt, optimizers.Optimizer) and - not isinstance(opt, optimizers.TFOptimizer)) - for opt in nest.flatten(optimizer)) - if is_any_keras_optimizer_v1: + if any( + isinstance(opt, optimizers.Optimizer) + for opt in nest.flatten(optimizer)): raise ValueError( '`tf.compat.v1.keras` Optimizer (', optimizer, ') is ' 'not supported when eager execution is enabled. Use a ' @@ -1259,1331 +1460,7 @@ class Model(network.Network, version_utils.ModelVersionSelector): ' model=_create_model()\n' ' model.compile(...)' % (v, strategy)) - def _prepare_validation_data(self, validation_data, batch_size, - validation_steps): - """Unpack and check the validation data.""" - val_x, val_y, val_sample_weights = training_utils.unpack_validation_data( - validation_data) - return self._standardize_user_data( - val_x, - val_y, - sample_weight=val_sample_weights, - batch_size=batch_size, - steps=validation_steps, - steps_name='validation_steps') - - def _process_target_tensor_for_compile(self, target_tensors): - if self.run_eagerly: - # target tensor is not supported with run_eagerly. Create a list with None - # as placeholder for each output. - return [None for _ in self.output_names] - - if target_tensors is not None and not (isinstance(target_tensors, list) and - target_tensors == []): # pylint: disable=g-explicit-bool-comparison - if isinstance(target_tensors, list): - if len(target_tensors) != len(self.outputs): - raise ValueError( - 'When passing a list as `target_tensors`, ' - 'it should have one entry per model output. ' - 'The model has %s outputs, but you passed target_tensors=%s' % - (len(self.outputs), target_tensors)) - elif isinstance(target_tensors, dict): - unexpected_target_tensor_names = set(target_tensors.keys()).difference( - self.output_names) - if unexpected_target_tensor_names: - raise ValueError( - 'Unknown entry in `target_tensors` dictionary: "{name}". 
' - 'Only expected the following keys: {keys}'.format( - name=unexpected_target_tensor_names, - keys=str(self.output_names))) - tmp_target_tensors = [] - for name in self.output_names: - tmp_target_tensors.append(target_tensors.get(name, None)) - target_tensors = tmp_target_tensors - elif tensor_util.is_tensor(target_tensors): - target_tensors = [target_tensors] - else: - raise TypeError('Expected `target_tensors` to be a list or tuple or ' - 'dict or a single tensor, but got:', target_tensors) - else: - # In case target tensor is empty or None, create a list with Nones - # that has same length as self.output_names. With that, the None check of - # target tensor can be skipped downstream. - target_tensors = [None for _ in self.output_names] - return target_tensors - - def _compile_eagerly(self, metrics, weighted_metrics, sample_weight_mode): - # Prepare sample weight modes. List with the same length as model outputs. - training_utils.prepare_sample_weight_modes( - self._training_endpoints, sample_weight_mode) - # Prepare sample weights. - self._prepare_sample_weights() - # Save all metric attributes per output of the model. - self._cache_output_metric_attributes(metrics, weighted_metrics) - self.total_loss = None - # Set metric attributes on model. - self._set_metric_attributes() - - self._collected_trainable_weights = self.trainable_weights - - def _update_sample_weight_modes(self, sample_weights=None): - """Updates sample weight modes based on training/eval inputs. - - Sample weight placeholders will be created for all or no outputs - based on whether sample_weight is provided for any output. - - If model contains `_sample_weight_modes` we check if the input - `sample_weights` corresponds to the sample weight modes. - 1. Set sample weight mode to be 'temporal' for output i, if `compile` - sample_weight_mode was set to `temporal` and sample weight inputs - are given for one or more outputs. - 2. Set sample weight mode to be 'samplewise' for output i, if `compile` - sample_weight_mode was not set and sample weight inputs are given for - one or more outputs. - 3. Reset sample weight mode to None for output i if sample weight mode - was set but there is no sample weight input. - - Args: - sample_weights: List of sample weights of the same length as model outputs - or None. - """ - if not self._is_compiled: - return - if sample_weights and any(s is not None for s in sample_weights): - for endpoint in self._training_endpoints: - endpoint.sample_weight_mode = ( - endpoint.sample_weight_mode or 'samplewise') - else: - for endpoint in self._training_endpoints: - endpoint.sample_weight_mode = None - - def _recompile_weights_loss_and_weighted_metrics(self): - if not self._is_compiled: - return False - recompile = any( - e.sample_weights_mismatch() for e in self._training_endpoints) - - if recompile: - self._compile_weights_loss_and_weighted_metrics() - return recompile - - @trackable.no_automatic_dependency_tracking - def _compile_weights_loss_and_weighted_metrics(self, sample_weights=None): - """Compiles the model loss and weighted metric sub-graphs. - - This may be used to set graph tensors as sample weights (instead of creating - placeholders). This functionality is necessary for - `tf.keras.estimator.model_to_estimator`, which calls Keras models in a v1 - graph, and creates iterator tensors for inputs, targets, and sample weights. - - Args: - sample_weights: List of tensors to use as the sample weights. Must be the - same length as the number of outputs. 
If left as `None`, placeholders - are used instead. - """ - with K.get_graph().as_default(): - if sample_weights is not None: - self._update_sample_weight_modes(sample_weights) - self._prepare_sample_weights(sample_weights) - - masks = self._prepare_output_masks() - - # Compute weighted metrics. - self._handle_metrics( - self.outputs, - targets=self._targets, - skip_target_masks=self._prepare_skip_target_masks(), - sample_weights=self.sample_weights, - masks=masks, - return_weighted_metrics=True) - - # Compute total loss. - # Used to keep track of the total loss value (stateless). - # eg., total_loss = loss_weight_1 * output_1_loss_fn(...) + - # loss_weight_2 * output_2_loss_fn(...) + - # layer losses. - self.total_loss = self._prepare_total_loss(masks) - - def _prepare_skip_target_masks(self): - """Boolean mask for whether the target in the output list should be skipped. - - If the loss function corresponding to a model output is None, then this - output will be skipped during total loss calculation and feed targets - preparation. - - Returns: - A boolean list for whether the corresponding target in the output list - should be skipped during loss calculation. - """ - return [l is None for l in self.loss_functions] - - def _prepare_output_masks(self): - """Returns masks corresponding to model outputs.""" - return [getattr(x, '_keras_mask', None) for x in self.outputs] - - def _prepare_total_loss(self, masks): - """Computes total loss from loss functions. - - Arguments: - masks: List of mask values corresponding to each model output. - - Returns: - A list of loss weights of python floats. - - Raises: - TypeError: If model run_eagerly is True. - """ - if self.run_eagerly: - raise TypeError('total loss can not be computed when compiled with ' - 'run_eagerly = True.') - total_loss = None - with K.name_scope('loss'): - for endpoint, mask in zip(self._training_endpoints, masks): - if endpoint.should_skip_target(): - continue - y_true = endpoint.training_target.target - y_pred = endpoint.output - loss_fn = endpoint.loss_fn - loss_weight = endpoint.loss_weight - loss_name = endpoint.loss_name() - sample_weight = endpoint.sample_weight - - with K.name_scope(loss_name): - if mask is not None: - mask = math_ops.cast(mask, y_pred.dtype) - # Update weights with mask. - if sample_weight is None: - sample_weight = mask - else: - # Update dimensions of weights to match with mask if possible. - mask, _, sample_weight = ( - tf_losses_utils.squeeze_or_expand_dimensions( - mask, sample_weight=sample_weight)) - sample_weight *= mask - - if hasattr(loss_fn, 'reduction'): - per_sample_losses = loss_fn.call(y_true, y_pred) - weighted_losses = losses_utils.compute_weighted_loss( - per_sample_losses, - sample_weight=sample_weight, - reduction=losses_utils.ReductionV2.NONE) - loss_reduction = loss_fn.reduction - - # `AUTO` loss reduction defaults to `SUM_OVER_BATCH_SIZE` for all - # compile use cases. - if loss_reduction == losses_utils.ReductionV2.AUTO: - loss_reduction = losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE - - # Compute the stateless loss value. - output_loss = losses_utils.reduce_weighted_loss( - weighted_losses, reduction=loss_reduction) - else: - # Compute the stateless loss value for a custom loss class. - # Here we assume that the class takes care of loss reduction - # because if this class returns a vector value we cannot - # differentiate between use case where a custom optimizer - # expects a vector loss value vs unreduced per-sample loss value. 
- output_loss = loss_fn(y_true, y_pred, sample_weight=sample_weight) - loss_reduction = losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE - - if len(self.outputs) > 1: - # Keep track of stateful result tensor for the loss. - endpoint.output_loss_metric(output_loss) - - # Scale output loss for distribution. For custom losses we assume - # reduction was mean. - if loss_reduction == losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE: - output_loss = losses_utils.scale_loss_for_distribution(output_loss) - - if total_loss is None: - total_loss = loss_weight * output_loss - else: - total_loss += loss_weight * output_loss - if total_loss is None: - if not self.losses: - raise ValueError('The model cannot be compiled ' - 'because it has no loss to optimize.') - else: - total_loss = 0. - - # Add regularization penalties and other layer-specific losses. - custom_losses = self.get_losses_for(None) + self.get_losses_for( - self.inputs) - if custom_losses: - total_loss += losses_utils.scale_loss_for_distribution( - math_ops.add_n(custom_losses)) - return total_loss - - def _get_callback_model(self): - """Returns the Callback Model for this Model.""" - - if hasattr(self, '_replicated_model') and self._replicated_model: - # When using training_distributed, we set the callback model - # to an instance of the `DistributedModel` that we create in - # the `compile` call. The `DistributedModel` is initialized - # with the first replicated model. We need to set the callback - # model to a DistributedModel to allow us to override saving - # and loading weights when we checkpoint the model during training. - return self._replicated_model - if hasattr(self, 'callback_model') and self.callback_model: - return self.callback_model - return self - - def _validate_or_infer_batch_size(self, batch_size, steps, x): - """Validates that the `batch_size` provided is consistent with InputLayer. - - It's possible that the user specified a static batch size in their - InputLayer. If so, this method checks the provided `batch_size` and `x` - arguments are consistent with this static batch size. Also, if - `batch_size` is `None`, this method will attempt to infer the batch size - from the static batch size of the InputLayer. Lastly, ValueError will be - raised if `x` is a tf.data.Dataset and `batch_size` is specified as we - expect users to provide batched datasets. - - Arguments: - batch_size: The batch_size provided as an argument to - fit/evaluate/predict. - steps: The steps provided as an argument to fit/evaluate/predict. - x: The data passed as `x` to fit/evaluate/predict. - - Returns: - The validated batch_size, auto-inferred from the first layer if not - provided. - """ - if (isinstance(x, (dataset_ops.DatasetV1, - dataset_ops.DatasetV2, - data_utils.Sequence)) or - tf_inspect.isgenerator(x)): - if batch_size is not None: - raise ValueError( - 'The `batch_size` argument must not be specified for the given ' - 'input type. Received input: {}, batch_size: {}'.format( - x, batch_size)) - return - - # Avoids the override in Sequential.layers which filters Input layers. - # (Which are often the very layers that we're after.) - layers = trackable_layer_utils.filter_empty_layer_containers(self._layers) - first_layer = next(layers, None) - if first_layer: - # The per-replica static batch size. - static_batch_size = training_utils.get_static_batch_size(first_layer) - if static_batch_size is not None: - - # Determine number of times the user-supplied batch size will be split. 
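# --- Editorial note (illustrative sketch only, not part of this patch) ---
# The removed validation below splits the user-supplied (global) batch size
# evenly across replicas and compares the per-replica share against any static
# batch size declared on the InputLayer. A minimal sketch of that arithmetic,
# with an assumed replica count:
num_replicas_in_sync = 4                 # e.g. MirroredStrategy over 4 GPUs (assumed)
global_batch_size = 64                   # batch_size passed to fit()/evaluate()/predict()
if global_batch_size % num_replicas_in_sync != 0:
    raise ValueError('global batch size must be divisible by the replica count')
per_replica_batch_size = global_batch_size // num_replicas_in_sync  # -> 16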
- if (self._distribution_strategy and - distributed_training_utils.global_batch_size_supported( - self._distribution_strategy)): - num_splits_for_ds = self._distribution_strategy.num_replicas_in_sync - else: - num_splits_for_ds = 1 - - # Check `batch_size` argument is consistent with InputLayer. - if batch_size is not None: - if batch_size % num_splits_for_ds != 0: - raise ValueError('The `batch_size` argument ({}) must be divisible ' - 'the by number of replicas ({})'.format( - batch_size, num_splits_for_ds)) - per_replica_batch_size = batch_size // num_splits_for_ds - - if per_replica_batch_size != static_batch_size: - raise ValueError('The `batch_size` argument value {} is ' - 'incompatible with the specified batch size of ' - 'your Input Layer: {}'.format( - per_replica_batch_size, static_batch_size)) - - # Check Dataset/Iterator batch size is consistent with InputLayer. - if isinstance(x, (dataset_ops.DatasetV2, iterator_ops.Iterator, - iterator_ops.OwnedIterator)): - ds_batch_size = tensor_shape.as_dimension( - nest.flatten(dataset_ops.get_legacy_output_shapes(x))[0][0]).value - if ds_batch_size is not None: - if ds_batch_size % num_splits_for_ds != 0: - raise ValueError( - 'The batch output shape of your `Dataset` {} ' - 'cannot be divisible by number of replicas {}'.format( - ds_batch_size, num_splits_for_ds)) - - ds_per_replica_batch_size = ds_batch_size // num_splits_for_ds - if ds_per_replica_batch_size != static_batch_size: - raise ValueError('The batch output shape of your `Dataset` is ' - '{}, which is incompatible with the specified ' - 'batch size of your Input Layer: {}'.format( - ds_per_replica_batch_size, - static_batch_size)) - - # Set inferred batch size from the InputLayer. - if steps is None: - batch_size = static_batch_size * num_splits_for_ds - - if batch_size is None and steps is None: - # Backwards compatibility - batch_size = 32 - return batch_size - - def _prepare_sample_weights(self, sample_weights=None): - """Sets sample weight attribute on the model.""" - # List with the same length as model outputs. - if sample_weights is not None: - if len(sample_weights) != len(self._training_endpoints): - raise ValueError('Provided sample weights must have same length as the ' - 'number of outputs. Expected: {}, got: {}.'.format( - len(self._training_endpoints), - len(sample_weights))) - else: - sample_weights = [None] * len(self._training_endpoints) - for endpoint, weight in zip(self._training_endpoints, sample_weights): - endpoint.populate_sample_weight(weight, endpoint.sample_weight_mode) - - def _cache_output_metric_attributes(self, metrics, weighted_metrics): - """Caches metric name and function attributes for every model output.""" - output_shapes = [] - for output in self.outputs: - if output is None or output.shape.rank is None: - output_shapes.append(None) - else: - output_shapes.append(output.shape.as_list()) - self._per_output_metrics = training_utils.collect_per_output_metric_info( - metrics, self.output_names, output_shapes, self.loss_functions) - self._per_output_weighted_metrics = ( - training_utils.collect_per_output_metric_info( - weighted_metrics, - self.output_names, - output_shapes, - self.loss_functions, - is_weighted=True)) - - def _add_unique_metric_name(self, metric_name, output_index): - """Makes the metric name unique and adds it to the model's metric name list. - - If there are multiple outputs for which the metrics are calculated, the - metric names have to be made unique by appending an integer. 
- - Arguments: - metric_name: Metric name that corresponds to the metric specified by the - user. For example: 'acc'. - output_index: The index of the model output for which the metric name is - being added. - - Returns: - string, name of the model's unique metric name - """ - if len(self.output_names) > 1: - metric_name = '%s_%s' % (self.output_names[output_index], metric_name) - j = 1 - base_metric_name = metric_name - while metric_name in self.metrics_names: - metric_name = '%s_%d' % (base_metric_name, j) - j += 1 - - return metric_name - - def _init_metric_attributes(self): - """Initialized model metric attributes.""" - # List of stateful metric functions. Used for resetting metric state during - # training/eval. - self._compile_metric_functions = [] - - def _set_per_output_metric_attributes(self, metrics_dict, output_index): - """Sets the metric attributes on the model for the given output. - - Arguments: - metrics_dict: A dict with metric names as keys and metric fns as values. - output_index: The index of the model output for which the metric - attributes are added. - - Returns: - Metrics dict updated with unique metric names as keys. - """ - updated_metrics_dict = collections.OrderedDict() - for metric_name, metric_fn in metrics_dict.items(): - metric_name = self._add_unique_metric_name(metric_name, output_index) - - # Update the name on the metric class to be the unique generated name. - metric_fn._name = metric_name # pylint: disable=protected-access - updated_metrics_dict[metric_name] = metric_fn - # Keep track of metric name and function. - self._compile_metric_functions.append(metric_fn) - return updated_metrics_dict - - def _set_metric_attributes(self): - """Sets the metric attributes on the model for all the model outputs.""" - updated_per_output_metrics = [] - updated_per_output_weighted_metrics = [] - for i, endpoint in enumerate(self._training_endpoints): - if endpoint.should_skip_target(): - updated_per_output_metrics.append(self._per_output_metrics[i]) - updated_per_output_weighted_metrics.append( - self._per_output_weighted_metrics[i]) - continue - updated_per_output_metrics.append( - self._set_per_output_metric_attributes(self._per_output_metrics[i], - i)) - updated_per_output_weighted_metrics.append( - self._set_per_output_metric_attributes( - self._per_output_weighted_metrics[i], i)) - - # Create a metric wrapper for each output loss. This computes mean of an - # output loss across mini-batches (irrespective of how we reduce within a - # batch). - if len(self._training_endpoints) > 1: - for endpoint in self._training_endpoints: - if not endpoint.should_skip_target(): - endpoint.output_loss_metric = metrics_module.Mean( - name=endpoint.loss_name()) - - self._per_output_metrics = updated_per_output_metrics - self._per_output_weighted_metrics = updated_per_output_weighted_metrics - - def _handle_per_output_metrics(self, - metrics_dict, - y_true, - y_pred, - mask, - weights=None): - """Calls metric functions for a single output. - - Arguments: - metrics_dict: A dict with metric names as keys and metric fns as values. - y_true: Target output. - y_pred: Predicted output. - mask: Computed mask value for the current output. - weights: Weights to be applied on the current output. - - Returns: - A list of metric result tensors. 
- """ - metric_results = [] - for metric_name, metric_fn in metrics_dict.items(): - with K.name_scope(metric_name): - metric_result = training_utils.call_metric_function( - metric_fn, y_true, y_pred, weights=weights, mask=mask) - metric_results.append(metric_result) - return metric_results - - def _handle_metrics(self, - outputs, - targets=None, - skip_target_masks=None, - sample_weights=None, - masks=None, - return_weighted_metrics=False, - return_weighted_and_unweighted_metrics=False): - """Handles calling metric functions. - - Arguments: - outputs: List of outputs (predictions). - targets: List of targets. - skip_target_masks: Optional. List of boolean for whether the corresponding - target should be ignored or not. - sample_weights: Optional list of sample weight arrays. - masks: List of computed output mask values. - return_weighted_metrics: Flag that indicates whether weighted metrics - should be computed instead of unweighted metrics. This flag is ignored - when `return_weighted_and_unweighted_metrics` is enabled. - return_weighted_and_unweighted_metrics: Flag that is used to indicate - whether both weighted and unweighted metrics should be computed. When - this is not enabled, we use `return_weighted_metrics` param to indicate - whether weighted or unweighted metrics should be returned. - - Returns: - A list of metric result tensors. - """ - # TODO(scottzhu): Update this to use the new training_endpoints. Currently - # the eager and graph logic is bit different. - skip_target_masks = skip_target_masks or [False] * len(outputs) - metric_results = [] - with K.name_scope('metrics'): - # Invoke all metrics added using `compile`. - for i in range(len(outputs)): - if skip_target_masks[i]: - continue - output = outputs[i] if outputs else None - target = targets[i] if targets else None - output_mask = masks[i] if masks else None - - if (return_weighted_and_unweighted_metrics or - not return_weighted_metrics): - metric_results.extend( - self._handle_per_output_metrics(self._per_output_metrics[i], - target, output, output_mask)) - if return_weighted_and_unweighted_metrics or return_weighted_metrics: - metric_results.extend( - self._handle_per_output_metrics( - self._per_output_weighted_metrics[i], - target, - output, - output_mask, - weights=sample_weights[i] if sample_weights else None)) - return metric_results - - def _check_trainable_weights_consistency(self): - """Check trainable weights count consistency. - - This will raise a warning if `trainable_weights` and - `_collected_trainable_weights` are inconsistent (i.e. have different - number of parameters). - Inconsistency will typically arise when one modifies `model.trainable` - without calling `model.compile` again. - """ - if not hasattr(self, '_collected_trainable_weights'): - return - - if len(self.trainable_weights) != len(self._collected_trainable_weights): - logging.log_first_n( - logging.WARN, 'Discrepancy between trainable weights and collected' - ' trainable weights, did you set `model.trainable`' - ' without calling `model.compile` after ?', 1) - - def _make_train_function(self): - has_recompiled = self._recompile_weights_loss_and_weighted_metrics() - self._check_trainable_weights_consistency() - if isinstance(self.optimizer, list): - raise ValueError('The `optimizer` in `compile` should be a single ' - 'optimizer.') - # If we have re-compiled the loss/weighted metric sub-graphs then create - # train function even if one exists already. This is because - # `_feed_sample_weights` list has been updated on re-compile. 
- if getattr(self, 'train_function', None) is None or has_recompiled: - # Restore the compiled trainable state. - current_trainable_state = self._get_trainable_state() - self._set_trainable_state(self._compiled_trainable_state) - - inputs = (self._feed_inputs + - self._feed_targets + - self._feed_sample_weights) - if not isinstance(K.symbolic_learning_phase(), int): - inputs += [K.symbolic_learning_phase()] - - with K.get_graph().as_default(): - with K.name_scope('training'): - # Training updates - updates = self.optimizer.get_updates( - params=self._collected_trainable_weights, loss=self.total_loss) - # Unconditional updates - updates += self.get_updates_for(None) - # Conditional updates relevant to this model - updates += self.get_updates_for(self.inputs) - - metrics = self._get_training_eval_metrics() - metrics_tensors = [ - m._call_result for m in metrics if hasattr(m, '_call_result') # pylint: disable=protected-access - ] - - with K.name_scope('training'): - # Gets loss and metrics. Updates weights at each call. - fn = K.function( - inputs, [self.total_loss] + metrics_tensors, - updates=updates, - name='train_function') - setattr(self, 'train_function', fn) - - # Restore the current trainable state - self._set_trainable_state(current_trainable_state) - - def _make_test_function(self): - has_recompiled = self._recompile_weights_loss_and_weighted_metrics() - # If we have re-compiled the loss/weighted metric sub-graphs then create - # test function even if one exists already. This is because - # `_feed_sample_weights` list has been updated on re-compile. - if getattr(self, 'test_function', None) is None or has_recompiled: - inputs = (self._feed_inputs + - self._feed_targets + - self._feed_sample_weights) - - with K.get_graph().as_default(): - metrics = self._get_training_eval_metrics() - metrics_tensors = [ - m._call_result for m in metrics if hasattr(m, '_call_result') # pylint: disable=protected-access - ] - - with K.name_scope('evaluation'): - updates = self.state_updates - # Return loss and metrics, no gradient updates. - # Does update the network states. - fn = K.function( - inputs, [self.total_loss] + metrics_tensors, - updates=updates, - name='test_function') - setattr(self, 'test_function', fn) - - def _make_predict_function(self): - if not hasattr(self, 'predict_function'): - self.predict_function = None - if self.predict_function is None: - inputs = self._feed_inputs - # Gets network outputs. Does not update weights. - # Does update the network states. - kwargs = getattr(self, '_function_kwargs', {}) - with K.name_scope(ModeKeys.PREDICT): - self.predict_function = K.function( - inputs, - self.outputs, - updates=self.state_updates, - name='predict_function', - **kwargs) - - def _make_execution_function(self, mode): - if mode == ModeKeys.TRAIN: - self._make_train_function() - return self.train_function - if mode == ModeKeys.TEST: - self._make_test_function() - return self.test_function - if mode == ModeKeys.PREDICT: - self._make_predict_function() - return self.predict_function - - def _distribution_standardize_user_data(self, - x, - y=None, - sample_weight=None, - class_weight=None, - batch_size=None, - validation_split=0, - shuffle=False, - epochs=1, - allow_partial_batch=False): - """Runs validation checks on input and target data passed by the user. - - This is called when using tf.distribute.Strategy to train, evaluate or serve - the model. - - Args: - x: Input data. A numpy array or `tf.data` dataset. - y: Target data. A numpy array or None if x is a `tf.data` dataset. 
- sample_weight: An optional sample-weight array passed by the user to - weight the importance of each sample in `x`. - class_weight: An optional class-weight array by the user to - weight the importance of samples in `x` based on the class they belong - to, as conveyed by `y`. - batch_size: Integer batch size. If provided, it is used to run additional - validation checks on stateful models. - validation_split: Float between 0 and 1. - Fraction of the training data to be used as validation data. - shuffle: Boolean whether to shuffle the training data before each epoch. - epochs: Integer epochs. If > 1, repeat the numpy training data epochs - times when converting to training dataset. - allow_partial_batch: Boolean whether to enforce that all batches have the - same size. - - Returns: - Dataset instance. - - Raises: - ValueError: In case of invalid user-provided data. - RuntimeError: If the model was never compiled. - """ - if class_weight: - raise NotImplementedError('`class_weight` is currently not supported ' - 'when using tf.distribute.Strategy.') - - if (sample_weight is not None and sample_weight.all() and - distributed_training_utils.is_tpu_strategy( - self._distribution_strategy)): - raise NotImplementedError('`sample_weight` is currently not supported ' - 'when using TPUStrategy.') - - # Validates `steps` and `shuffle` arguments right at the beginning - # since we use it to construct the dataset object. - # TODO(anjalisridhar): Remove this check once we refactor the - # _standardize_user_data code path. This check is already present elsewhere - # in the codebase. - if isinstance(x, dataset_ops.DatasetV2): - if shuffle: - training_utils.verify_dataset_shuffled(x) - - strategy = self._distribution_strategy - with strategy.scope(): - # We should be sure to call get_session() inside the strategy.scope() - # so the strategy can affect the session options. - if ops.executing_eagerly_outside_functions(): - session = None - else: - session = K.get_session() - - first_x_value = nest.flatten(x)[0] - if isinstance(first_x_value, np.ndarray): - x = training_utils.list_to_tuple(x) - if y is not None: - y = training_utils.list_to_tuple(y) - if sample_weight is not None: - sample_weight = training_utils.list_to_tuple(sample_weight) - in_tuple = (x, y, sample_weight) - else: - in_tuple = (x, y) - else: - in_tuple = x - - ds = strategy.extended.experimental_make_numpy_dataset(in_tuple, - session=session) - if shuffle: - # We want a buffer size that is larger than the batch size provided by - # the user and provides sufficient randomness. Note that larger - # numbers introduce more memory usage based on the size of each - # sample. - ds = ds.shuffle(max(1024, batch_size * 8)) - if epochs > 1: - ds = ds.repeat(epochs) - - # We need to use the drop_remainder argument to get a known static - # input shape which is required for TPUs. - drop_remainder = (not allow_partial_batch and - strategy.extended.experimental_require_static_shapes) - - # TODO(b/131720208): We still drop remainder here if number of examples - # is divisible by batch size, as sometimes dynamic padder will time out - # with keras.metrics.CategoricalAccuracy() metric. 
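# --- Editorial note (illustrative sketch only, not part of this patch) ---
# Context for the TPU-specific branch below: `drop_remainder=True` gives every
# batch a fully known (static) batch dimension, which TPUs require, at the cost
# of discarding a trailing partial batch. A standalone tf.data example:
import tensorflow as tf

ds = tf.data.Dataset.range(10)
static_ds = ds.batch(3, drop_remainder=True)    # 3 batches, element shape (3,)
dynamic_ds = ds.batch(3, drop_remainder=False)  # 4 batches, element shape (None,)
print(static_ds.element_spec)    # TensorSpec(shape=(3,), dtype=tf.int64, name=None)
print(dynamic_ds.element_spec)   # TensorSpec(shape=(None,), dtype=tf.int64, name=None)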
- if distributed_training_utils.is_tpu_strategy( - strategy) and not drop_remainder: - dataset_size = first_x_value.shape[0] - if dataset_size % batch_size == 0: - drop_remainder = True - - x = ds.batch(batch_size, drop_remainder=drop_remainder) - else: - assert isinstance(x, dataset_ops.DatasetV2) - training_utils.validate_dataset_input(x, y, sample_weight, - validation_split) - return x - - def _standardize_user_data(self, - x, - y=None, - sample_weight=None, - class_weight=None, - batch_size=None, - check_steps=False, - steps_name='steps', - steps=None, - validation_split=0, - shuffle=False, - extract_tensors_from_dataset=False): - """Runs validation checks on input and target data passed by the user. - - Also standardizes the data to lists of arrays, in order. - - Also builds and compiles the model on the fly if it is a subclassed model - that has never been called before (and thus has no inputs/outputs). - - This is a purely internal method, subject to refactoring at any time. - - Args: - x: Input data. It could be: - - A Numpy array (or array-like), or a list of arrays - (in case the model has multiple inputs). - - A TensorFlow tensor, or a list of tensors - (in case the model has multiple inputs). - - A dict mapping input names to the corresponding array/tensors, - if the model has named inputs. - - A `tf.data` dataset. - y: Target data. Like the input data `x`, - it could be either Numpy array(s) or TensorFlow tensor(s). - It should be consistent with `x` (you cannot have Numpy inputs and - tensor targets, or inversely). If `x` is a dataset, `y` should not be - specified (since targets will be obtained from the iterator). - sample_weight: An optional sample-weight array passed by the user to - weight the importance of each sample in `x`. - class_weight: An optional class-weight array by the user to - weight the importance of samples in `x` based on the class they belong - to, as conveyed by `y`. If both `sample_weight` and `class_weight` are - provided, the weights are multiplied. - batch_size: Integer batch size. If provided, it is used to run additional - validation checks on stateful models. - check_steps: boolean, True if we want to check for validity of `steps` and - False, otherwise. For example, when we are standardizing one batch of - data for train_on_batch/predict_on_batch/test_on_batch APIs, `steps` - value is not required and we should not check for its validity in these - cases. - steps_name: The public API's parameter name for `steps`. - steps: Integer or `None`. Total number of steps (batches of samples) to - execute. - validation_split: Float between 0 and 1. - Fraction of the training data to be used as validation data. - shuffle: Boolean whether to shuffle the training data before each epoch. - extract_tensors_from_dataset: Boolean. When `x` is a dataset instance, - this indicates whether to extract actual tensors from the dataset or - instead output the dataset instance itself. - Set to True when calling from `train_on_batch`/etc. - - Returns: - A tuple of 3: inputs (arrays or dicts, depending on whether `x` was a dict - or not), target arrays, sample-weight arrays. - If the model's input and targets are symbolic, these lists are empty - (since the model takes no user-provided data, instead the data comes - from the symbolic inputs/targets). - - Raises: - ValueError: In case of invalid user-provided data. - RuntimeError: If the model was never compiled. - """ - if isinstance(x, (dataset_ops.DatasetV1, dataset_ops.DatasetV2)): - # Graph mode dataset. 
We'll pass the dataset as-is (unless - # `extract_tensors_from_dataset` is True, in which case we extract - # the tensors from the dataset and we output them. - training_utils.validate_dataset_input(x, y, sample_weight, - validation_split) - if shuffle: - training_utils.verify_dataset_shuffled(x) - - is_dataset = True - if extract_tensors_from_dataset: - # We do this for `train_on_batch`/etc. - x, y, sample_weight = training_utils.extract_tensors_from_dataset(x) - elif isinstance(x, iterator_ops.Iterator): - # Graph mode iterator. We extract the symbolic tensors. - training_utils.validate_dataset_input(x, y, sample_weight, - validation_split) - iterator = x - x, y, sample_weight = training_utils.unpack_iterator_input(iterator) - is_dataset = True - else: - is_dataset = False - - # Validates `steps` argument based on x's type. - if check_steps: - training_utils.check_steps_argument(x, steps, steps_name) - - # First, we build the model on the fly if necessary. - if not self.inputs: - all_inputs, y_input, dict_inputs = self._build_model_with_inputs(x, y) - is_build_called = True - else: - all_inputs = [] - # Whether this is a subclassed model that expects dictionary inputs - # rather than list inputs (e.g. FeatureColumn-based models). - dict_inputs = isinstance(self.inputs, dict) - is_build_called = False - y_input = y - - # Second, we compile the model on the fly if necessary, mostly for subclass - # models. - is_compile_called = False - if not self._is_compiled and self.optimizer: - self._compile_from_inputs(all_inputs, y_input, x, y) - is_compile_called = True - - # In graph mode, if we had just set inputs and targets as symbolic tensors - # by invoking build and compile on the model respectively, we do not have to - # feed anything to the model. Model already has input and target data as - # part of the graph. - # Note: in this case, `any` and `all` are equivalent since we disallow - # mixed symbolic/value inputs. - - # self.run_eagerly is not free to compute, so we want to reuse the value. - run_eagerly = self.run_eagerly - - if (not run_eagerly and is_build_called and is_compile_called and - not is_dataset and any(_is_symbolic_tensor(v) for v in all_inputs)): - return [], [], None - - return self._standardize_tensors( - x, y, sample_weight, - run_eagerly=run_eagerly, - dict_inputs=dict_inputs, - is_dataset=is_dataset, - class_weight=class_weight, - batch_size=batch_size) - - def _standardize_tensors(self, x, y, sample_weight, run_eagerly, dict_inputs, - is_dataset, class_weight=None, batch_size=None): - if run_eagerly: - # In eager mode, do not do shape validation - # since the network has no input nodes (placeholders) to be fed. - feed_input_names = self.input_names - feed_input_shapes = None - elif not self._is_graph_network: - # Case: symbolic-mode subclassed network. Do not do shape validation. - feed_input_names = self._feed_input_names - feed_input_shapes = None - else: - # Case: symbolic-mode graph network. - # In this case, we run extensive shape validation checks. - feed_input_names = self._feed_input_names - feed_input_shapes = self._feed_input_shapes - - # Standardize the inputs. - if not isinstance(x, (dataset_ops.DatasetV1, dataset_ops.DatasetV2)): - # TODO(fchollet): run static checks with dataset output shape(s). - x = training_utils.standardize_input_data( - x, - feed_input_names, - feed_input_shapes, - check_batch_axis=False, # Don't enforce the batch size. - exception_prefix='input') - - # Get typespecs for the input data and sanitize it if necessary. 
- # TODO(momernick): This should be capable of doing full input validation - # at all times - validate that this is so and refactor the standardization - # code. - if isinstance(x, dataset_ops.DatasetV2): - x_shapes = dataset_ops.get_structure(x) - if isinstance(x_shapes, tuple): - # If the output of a Dataset is a tuple, we assume it's either of the - # form (x_data, y_data) or (x_data, y_data, sample_weights). In either - # case, we only care about x_data here. - x_shapes = x_shapes[0] - else: - flat_inputs = nest.flatten(x, expand_composites=False) - flat_expected_inputs = nest.flatten(self.inputs, expand_composites=False) - converted_x = [] - for (a, b) in zip(flat_inputs, flat_expected_inputs): - converted_x.append(_convert_scipy_sparse_tensor(a, b)) - x = nest.pack_sequence_as(x, converted_x, expand_composites=False) - - x_shapes = nest.map_structure(tf_utils.type_spec_from_value, x) - - flat_inputs = nest.flatten(x_shapes, expand_composites=False) - - x_expected_shapes = nest.map_structure(tf_utils.type_spec_from_value, - self.inputs) - flat_expected_inputs = nest.flatten( - x_expected_shapes, expand_composites=False) - for (a, b) in zip(flat_inputs, flat_expected_inputs): - nest.assert_same_structure(a, b, expand_composites=True) - - if y is not None: - # Prepare self._sample_weight_modes. List with the same length as - # model outputs. - training_utils.prepare_sample_weight_modes(self._training_endpoints, - self.sample_weight_mode) - feed_output_names = self._feed_output_names - feed_sample_weight_modes = self._sample_weight_modes - if not self._is_graph_network: - feed_output_shapes = None - else: - feed_output_shapes = self._feed_output_shapes - - # Standardize the outputs. - y = training_utils.standardize_input_data( - y, - feed_output_names, - # Don't enforce target shapes to match output shapes. - # Precise checks will be run in `check_loss_and_target_compatibility`. - shapes=None, - check_batch_axis=False, # Don't enforce the batch size. - exception_prefix='target') - - # Generate sample-wise weight values given the `sample_weight` and - # `class_weight` arguments. - sample_weights = training_utils.standardize_sample_weights( - sample_weight, feed_output_names) - class_weights = training_utils.standardize_class_weights( - class_weight, feed_output_names) - - sample_weights = [ - training_utils.standardize_weights(ref, sw, cw, mode) - for (ref, sw, cw, mode) in zip(y, sample_weights, class_weights, - feed_sample_weight_modes) - ] - # Check that all arrays have the same length. - if not self._distribution_strategy: - training_utils.check_array_lengths(x, y, sample_weights) - if self._is_graph_network and not run_eagerly: - # Additional checks to avoid users mistakenly using improper loss fns. - training_utils.check_loss_and_target_compatibility( - y, self._feed_loss_fns, feed_output_shapes) - - sample_weights, _, _ = training_utils.handle_partial_sample_weights( - y, sample_weights, feed_sample_weight_modes, check_all_flat=True) - else: - y = [] - sample_weights = None - - if self.stateful and batch_size and not is_dataset: - # Check that for stateful networks, number of samples is a multiple - # of the static batch size. - if x[0].shape[0] % batch_size != 0: - raise ValueError('In a stateful network, ' - 'you should only pass inputs with ' - 'a number of samples that can be ' - 'divided by the batch size. Found: ' + - str(x[0].shape[0]) + ' samples') - - # If dictionary inputs were provided, we return a dictionary as well. 
- if dict_inputs and not isinstance(x, (dataset_ops.DatasetV1, - dataset_ops.DatasetV2)): - x = dict(zip(feed_input_names, x)) - return x, y, sample_weights - - def _build_model_with_inputs(self, inputs, targets): - """Build the model (set model inputs/outputs), mainly for subclass model.""" - processed_inputs = [] - is_dict_inputs = False - orig_inputs = inputs - # We need to use `inputs` to set the model inputs. - # If input data is a dataset iterator in graph mode or if it is an eager - # iterator and only one batch of samples is required, we fetch the data - # tensors from the iterator and then standardize them. - if isinstance(inputs, (dataset_ops.DatasetV1, dataset_ops.DatasetV2)): - inputs, targets, _ = training_utils.extract_tensors_from_dataset(inputs) - # We type-check that `inputs` and `targets` are either single arrays - # or lists of arrays, and extract a flat list of inputs from the passed - # structure. - training_utils.validate_input_types(inputs, orig_inputs) - - if isinstance(inputs, (list, tuple)): - processed_inputs += list(inputs) - elif isinstance(inputs, dict): - is_dict_inputs = True - keys = sorted(inputs.keys()) - processed_inputs = [inputs[k] for k in keys] - else: - processed_inputs.append(inputs) - # Now that we have a flat set of inputs, we make sure that none of them - # are CompositeTensors or CompositeTensorValues of any type (or scipy - # sparse arrays, which we treat as SparseTensor values). We cannot safely - # infer input data from an arbitrary composite tensor, so we don't try - - # users should explicitly add composite tensor inputs to their subclassed - # models. - for input_tensor in processed_inputs: - if composite_tensor_utils.is_composite_or_composite_value(input_tensor): - # TODO(b/132691975): Document subclass-model CT input handling. - raise ValueError( - 'All SparseTensor and RaggedTensor inputs must be explicitly ' - 'declared using a keras.Input() with sparse=True or ragged=True. ' - 'We found an undeclared input %s. For Sequential models, please ' - 'add a keras.Input() as your first Layer. For subclassed models, ' - 'please call self._set_inputs() on your input set, which you can ' - 'create using keras.Input() for each input to your model.' % - (input_tensor,)) - # Build the model using the retrieved inputs (value or symbolic). - # If values are generated from a dataset, then in symbolic-mode - # placeholders will be created to match the value shapes. - if isinstance(orig_inputs, (dataset_ops.DatasetV1, dataset_ops.DatasetV2, - iterator_ops.Iterator)): - if not self.inputs: - # For subclassed models, a robust input spec is not available so we - # must cast to the model dtype. - inputs = training_utils.cast_if_floating_dtype(inputs, self.dtype) - - def create_tensor_spec(t): - return tensor_spec.TensorSpec(t.shape, t.dtype) - - cast_inputs = nest.map_structure(create_tensor_spec, inputs) - elif training_utils.has_tensors(inputs): - cast_inputs = training_utils.cast_if_floating_dtype(inputs) - else: - cast_inputs = inputs - self._set_inputs(cast_inputs) - return processed_inputs, targets, is_dict_inputs - - def _compile_from_inputs(self, all_inputs, target, orig_inputs, orig_target): - if target is not None: - # We need to use `y` to set the model targets. 
- if training_utils.has_tensors(target): - target = training_utils.cast_if_floating_dtype_and_mismatch( - target, self.outputs) - training_utils.validate_input_types(target, orig_target, - allow_dict=False, field_name='target') - if isinstance(target, (list, tuple)): - all_inputs += list(target) - else: - all_inputs.append(target) - # Type check that all inputs are *either* value *or* symbolic. - # TODO(fchollet): this check could be removed in Eager mode? - if any(tensor_util.is_tensor(v) for v in all_inputs): - if not all(tensor_util.is_tensor(v) for v in all_inputs): - raise ValueError('Do not pass inputs that mix Numpy arrays and ' - 'TensorFlow tensors. ' - 'You passed: x=' + str(orig_inputs) + - '; y=' + str(orig_target)) - is_dataset = isinstance(orig_inputs, (dataset_ops.DatasetV1, - dataset_ops.DatasetV2, - iterator_ops.Iterator)) - if is_dataset or context.executing_eagerly(): - target_tensors = None - else: - # Handle target tensors if any passed. - if target is not None: - if not isinstance(target, (list, tuple)): - target = [target] - target_tensors = [v for v in target if _is_symbolic_tensor(v)] - else: - target_tensors = None - - self.compile( - optimizer=self.optimizer, - loss=self.loss, - metrics=self._compile_metrics, - weighted_metrics=self._compile_weighted_metrics, - loss_weights=self.loss_weights, - target_tensors=target_tensors, - sample_weight_mode=self.sample_weight_mode, - run_eagerly=self.run_eagerly) - - # TODO(omalleyt): Consider changing to a more descriptive function name. - def _set_inputs(self, inputs, outputs=None, training=None): - """Set model's input and output specs based on the input data received. - - This is to be used for Model subclasses, which do not know at instantiation - time what their inputs look like. - - Args: - inputs: Single array, or list of arrays. The arrays could be placeholders, - Numpy arrays, data tensors, or TensorSpecs. - - if placeholders: the model is built on top of these placeholders, - and we expect Numpy data to be fed for them when calling `fit`/etc. - - if Numpy data or TensorShapes: we create placeholders matching the - TensorShapes or shapes of the Numpy arrays. We expect Numpy data to be - fed for these placeholders when calling `fit`/etc. - - if data tensors: the model is built on top of these tensors. - We do not expect any Numpy data to be provided when calling `fit`/etc. - outputs: None, a data tensor, or a list of tensors. If None, the - outputs will be determined by invoking `self.call()`, otherwise the - provided value will be used. - training: Boolean or None. Only relevant in symbolic mode. Specifies - whether to build the model's graph in inference mode (False), training - mode (True), or using the Keras learning phase (None). - Raises: - ValueError: If dict inputs are passed to a Sequential Model where the - first layer isn't FeatureLayer. - """ - inputs = self._set_input_attrs(inputs) - - if outputs is None: - kwargs = {} - if self._expects_training_arg: - # In V2 mode, feeding `training=None` is not allowed because any value - # explicitly passed by the user is respected, even `None`.` - if training is None and not ops.executing_eagerly_outside_functions(): - training = K.learning_phase() - if training is not None: - kwargs['training'] = training - try: - outputs = self(inputs, **kwargs) - except NotImplementedError: - # This Model or a submodel is dynamic and hasn't overridden - # `compute_output_shape`. 
- outputs = None - - self._set_output_attrs(outputs) - - @trackable.no_automatic_dependency_tracking - def _set_input_attrs(self, inputs): - """Sets attributes related to the inputs of the Model.""" - if self.inputs: - raise ValueError('Model inputs are already set.') - - if self.__class__.__name__ == 'Sequential' and not self.built: - if tensor_util.is_tensor(inputs): - input_shape = (None,) + tuple(inputs.shape.as_list()[1:]) - elif isinstance(inputs, tensor_shape.TensorShape): - input_shape = (None,) + tuple(inputs.as_list()[1:]) - elif isinstance(inputs, dict): - # We assert that the first layer is a FeatureLayer. - if not training_utils.is_feature_layer(self.layers[0]): - raise ValueError('Passing a dictionary input to a Sequential Model ' - 'which doesn\'t have FeatureLayer as the first layer' - ' is an error.') - input_shape = (None,) - else: - input_shape = (None,) + tuple(inputs.shape[1:]) - self._build_input_shape = input_shape - - # Cast inputs to the compute dtype. This is primarily used - # when saving to determine the correct dtype in the input signature. - inputs = self._maybe_cast_inputs(inputs) - - # On-the-fly setting of symbolic model inputs (either by using the tensor - # provided, or by creating a placeholder if Numpy data was provided). - model_inputs = training_utils.ModelInputs(inputs) - inputs = model_inputs.get_symbolic_inputs() - self.inputs = model_inputs.get_symbolic_inputs(return_single_as_list=True) - self.input_names = model_inputs.get_input_names() - - self._feed_inputs = [] - self._feed_input_names = [] - self._feed_input_shapes = [] - - for k, v in model_inputs.as_dict(): - if K.is_placeholder(v): - self._feed_input_names.append(k) - self._feed_inputs.append(v) - self._feed_input_shapes.append(K.int_shape(v)) - - return inputs - - @trackable.no_automatic_dependency_tracking - def _set_output_attrs(self, outputs): - """Sets attributes related to the outputs of the Model.""" - # NOTE(taylorrobie): This convention cannot be changed without updating the - # data adapter since it assumes nest.flatten ordering. - outputs = nest.flatten(outputs) - self.outputs = outputs - self.output_names = training_utils.generic_output_names(outputs) - # TODO(scottzhu): Should we cleanup the self._training_endpoints here? 
- self.built = True - - @property - def _targets(self): - """The output target tensors for the model.""" - return [ - e.training_target.target - for e in self._training_endpoints - if e.has_training_target() - ] - - @property - def _feed_targets(self): - return [ - e.training_target.target - for e in self._training_endpoints - if e.has_feedable_training_target() - ] - - @property - def _feed_output_names(self): - return [ - e.output_name - for e in self._training_endpoints - if e.has_feedable_training_target() - ] - - @property - def _feed_output_shapes(self): - return [ - e.feed_output_shape - for e in self._training_endpoints - if e.has_feedable_training_target() - ] - - @property - def _feed_loss_fns(self): - return [ - e.loss_fn - for e in self._training_endpoints - if e.has_feedable_training_target() - ] - - @property - def _loss_weights_list(self): - return [e.loss_weight for e in self._training_endpoints] - - @property - def _output_loss_metrics(self): - if hasattr(self, '_training_endpoints'): - return [ - e.output_loss_metric - for e in self._training_endpoints - if e.output_loss_metric is not None - ] - return None - - @property - def sample_weights(self): - return [e.sample_weight for e in self._training_endpoints] - - @property - def _sample_weight_modes(self): - return [e.sample_weight_mode for e in self._training_endpoints] - - @property - def _feed_sample_weights(self): - return [e.sample_weight for e in self._training_endpoints - if e.sample_weight is not None] - - def _maybe_load_initial_epoch_from_ckpt(self, initial_epoch, mode): + def _maybe_load_initial_epoch_from_ckpt(self, initial_epoch): """Maybe load initial epoch from ckpt considering possible worker recovery. Refer to tensorflow/python/keras/distribute/multi_worker_training_state.py @@ -2591,375 +1468,134 @@ class Model(network.Network, version_utils.ModelVersionSelector): Arguments: initial_epoch: The original initial_epoch user passes in in `fit()`. - mode: The mode for running `model.fit()`. Returns: If the training is recovering from previous failure under multi-worker training setting, return the epoch the training is supposed to continue at. Otherwise, return the `initial_epoch` the user passes in. """ - if hasattr(self, '_training_state'): + if self._training_state is not None: return self._training_state.maybe_load_initial_epoch_from_ckpt( - initial_epoch, mode) + initial_epoch, mode=ModeKeys.TRAIN) return initial_epoch - def _get_training_eval_metrics(self): - """Returns all the metrics that are to be reported. - - This includes the output loss metrics, compile metrics/weighted metrics, - add_metric metrics. - """ - metrics = [] - metrics.extend(getattr(self, '_output_loss_metrics', None) or []) - metrics.extend(getattr(self, 'metrics', None) or []) - return metrics - def _assert_compile_was_called(self): # Checks whether `compile` has been called. If it has been called, # then the optimizer is set. This is different from whether the # model is compiled # (i.e. whether the model is built and its inputs/outputs are set). - if not self.optimizer: + if not self._is_compiled: raise RuntimeError('You must compile your model before ' 'training/testing. ' 'Use `model.compile(optimizer, loss)`.') - def _in_multi_worker_mode(self): - """Method to infer if this `Model` is working in multi-worker settings. - - Multi-worker training refers to the setup where the training is - distributed across multiple workers, as opposed to the case where - only a local process performs the training. 
This function is - used to infer for example whether or not a distribute coordinator - should be run, and thus TensorFlow servers should be started for - communication with other servers in the cluster, or whether or not - saving/restoring checkpoints is relevant for preemption fault tolerance. - - Experimental. Signature and implementation are subject to change. - - Returns: - Whether this model indicates it's working in multi-worker settings. - """ - strategy = self._get_distribution_strategy() - return strategy and strategy.extended._in_multi_worker_mode() # pylint: disable=protected-access - - def _get_distribution_strategy(self): - # If the model was compiled under the scope of a `tf.distribute.Strategy', - # `self._distribution_strategy` would have been set and model should infer - # that as the used strategy (even if it's out of strategy scope already). - strategy = self._distribution_strategy - - # Otherwise, use the strategy whose scope this is in. - if not strategy and ds_context.has_strategy(): - strategy = ds_context.get_strategy() - - return strategy + def _set_inputs(self, inputs, outputs=None, training=None): + """This method is for compat with Modelv1. Only inputs are needed here.""" + self._set_save_spec(inputs) @property def _trackable_saved_model_saver(self): return model_serialization.ModelSavedModelSaver(self) + def _list_functions_for_serialization(self, serialization_cache): + # SavedModel needs to ignore the execution functions. + train_function = self.train_function + test_function = self.test_function + predict_function = self.predict_function + self.train_function = None + self.test_function = None + self.predict_function = None + functions = super( + Model, self)._list_functions_for_serialization(serialization_cache) + self.train_function = train_function + self.test_function = test_function + self.predict_function = predict_function + return functions -class _TrainingEndpoint(object): - """A container for the training output/target and related entities. - - In the case of model with multiple outputs, there is a one-to-one mapping - between model output (y_pred), model target (y_true), loss, metrics etc. - By unifying these entities into one class, different entity can access - information between each other, rather than currently access different list of - attributes of the model. - """ - - def __init__(self, - output, - output_name, - loss_fn, - loss_weight=None, - training_target=None, - output_loss_metric=None, - sample_weight=None, - sample_weight_mode=None): - """Initialize the _TrainingEndpoint. - - Note that the output and output_name should be stable as long as the model - structure doesn't change. The training_target suppose to be mutable since - the information is provided via `compile()` - - Args: - output: the output tensor of the model. - output_name: the unique name of the output tensor. - loss_fn: the loss function for the output tensor. - loss_weight: float, the weights for the loss. - training_target: the _TrainingTarget for the model. - output_loss_metric: the metric object for the loss function. - sample_weight: the weights for how a sample is weighted during metric and - loss calculation. Could be None. - sample_weight_mode: string, 'temporal', 'samplewise' or None. The mode for - how the sample_weight is populated. 
- """ - self._output = output - self._output_name = output_name - self._loss_fn = loss_fn - self._loss_weight = loss_weight - self._training_target = training_target - self._output_loss_metric = output_loss_metric - self._sample_weight = sample_weight - self._sample_weight_mode = sample_weight_mode - - @property - def output(self): - return self._output - - @property - def output_name(self): - return self._output_name - - @property - def shape(self): - return K.int_shape(self.output) - - @property - def loss_fn(self): - return self._loss_fn - - @property - def loss_weight(self): - return self._loss_weight - - @loss_weight.setter - def loss_weight(self, value): - self._loss_weight = value - - @property - def training_target(self): - return self._training_target - - @training_target.setter - def training_target(self, value): - self._training_target = value - - def create_training_target(self, target, run_eagerly=False): - """Create training_target instance and update the self.training_target. - - Note that the input target should just be a tensor or None, and - corresponding training target will be created based on the output and - loss_fn. - - Args: - target: the target tensor for the current output. Could be None. - run_eagerly: boolean, whether the model is in run_eagerly mode. - - Raises: - ValueError if the training_target field for the current instance has - already been populated. - """ - if self.has_training_target(): - raise ValueError('The training_target field for the _TrainingEndpoint ' - 'instance has already been populated') - if run_eagerly: - # When run_eagerly, the target tensor is ignored, and the None placeholder - # is created instead. - self.training_target = _TrainingTarget( - None, feedable=True, skip_target_weights=False) - return - - if self.should_skip_target(): - self.training_target = _TrainingTarget(None) + def _should_eval(self, epoch, validation_freq): + epoch = epoch + 1 # one-index the user-facing epoch. + if isinstance(validation_freq, int): + return epoch % validation_freq == 0 + elif isinstance(validation_freq, list): + return epoch in validation_freq else: - if target is not None and not K.is_placeholder(target): - feedable = False - skip_target_weights = True - else: - feedable = True - skip_target_weights = False + raise ValueError('Expected `validation_freq` to be a list or int.') - if target is None: - target_dtype = losses.LABEL_DTYPES_FOR_LOSSES.get( - self.loss_fn, K.dtype(self.output)) + ###################################################################### + # Functions below exist only as v1 / v2 compatibility shims. 
+ ###################################################################### - target = K.placeholder( - ndim=len(self.shape), - name=self.output_name + '_target', - sparse=K.is_sparse(self.output), - dtype=target_dtype) + def _get_compile_args(self): + """Used for saving or cloning a Model.""" + self._assert_compile_was_called() + # pylint: disable=protected-access + compile_args = { + 'optimizer': self.optimizer, + 'loss': self.compiled_loss._user_losses, + 'metrics': self.compiled_metrics._user_metrics, + 'weighted_metrics': self.compiled_metrics._user_weighted_metrics, + 'loss_weights': self.compiled_loss._user_loss_weights, + 'sample_weight_mode': None, + } + # pylint: enable=protected-access + return compile_args - self.training_target = _TrainingTarget( - target, - feedable=feedable, - skip_target_weights=skip_target_weights) + def _get_callback_model(self): + return self + + def _in_multi_worker_mode(self): + return self.distribute_strategy.extended._in_multi_worker_mode() # pylint: disable=protected-access + + def _get_distribution_strategy(self): + return self.distribute_strategy @property - def output_loss_metric(self): - return self._output_loss_metric - - @output_loss_metric.setter - def output_loss_metric(self, value): - self._output_loss_metric = value - - @property - def sample_weight(self): - return self._sample_weight - - @sample_weight.setter - def sample_weight(self, value): - self._sample_weight = value - - @property - def sample_weight_mode(self): - return self._sample_weight_mode - - @sample_weight_mode.setter - def sample_weight_mode(self, value): - self._sample_weight_mode = value - - def should_skip_target(self): - return self._loss_fn is None - - def should_skip_target_weights(self): - return (self.should_skip_target() or self.training_target is None or - self.training_target.skip_target_weights) - - def has_training_target(self): - return self.training_target is not None - - def has_feedable_training_target(self): - return (not self.should_skip_target() and - self.training_target is not None and self.training_target.feedable) - - def loss_name(self): - if self._loss_fn is not None: - return self._output_name + '_loss' - return None - - @property - def feed_output_shape(self): - """The output shape for the feedable target.""" - if not self.has_feedable_training_target(): - return None - - if ((isinstance(self.loss_fn, losses.LossFunctionWrapper) and - self.loss_fn.fn == losses.sparse_categorical_crossentropy)) or ( - isinstance(self.loss_fn, losses.SparseCategoricalCrossentropy)): - if K.image_data_format() == 'channels_first': - return (self.shape[0], 1) + self.shape[2:] - else: - return self.shape[:-1] + (1,) - elif (not isinstance(self.loss_fn, losses.Loss) or - (isinstance(self.loss_fn, losses.LossFunctionWrapper) and - (getattr(losses, self.loss_fn.fn.__name__, None) is None))): - # If the given loss is not an instance of the `Loss` class (custom - # class) or if the loss function that is wrapped is not in the - # `losses` module, then it is a user-defined loss and we make no - # assumptions about it. - return None - else: - return self.shape - - def sample_weights_mismatch(self): - """Check if the sample weight and the mode match or not.""" - # If there is a mismatch between sample weight mode and the placeholders - # created, then recompile the sub-graphs that depend on sample weights. 
- return ( - (self.sample_weight_mode is not None and self.sample_weight is None) or - (self.sample_weight_mode is None and self.sample_weight is not None)) - - def populate_sample_weight(self, sample_weight, sample_weight_mode): - """Populate the sample weight and based on the sample weight mode.""" - if (sample_weight is None and - (self.should_skip_target_weights() or sample_weight_mode is None or - context.executing_eagerly())): - self._sample_weight = None - return - - assert sample_weight_mode in ['temporal', 'samplewise'] - if sample_weight_mode == 'temporal': - default_value = [[1.]] - shape = [None, None] - else: - # sample_weight_mode == 'samplewise' - default_value = [1.] - shape = [None] - - if sample_weight is not None: - if not sample_weight.shape.is_compatible_with(shape): - raise ValueError('Received sample weight with shape {}. Expected shape ' - '{}.'.format(sample_weight.shape, shape)) - self._sample_weight = sample_weight - else: - self._sample_weight = array_ops.placeholder_with_default( - constant_op.constant(default_value, dtype=K.floatx()), - shape=shape, - name=self.output_name + '_sample_weights') + def _compile_was_called(self): + return self._is_compiled -class _TrainingTarget(object): - """Container for a target tensor (y_true) and its metadata (shape, loss...). +def reduce_per_replica(values, strategy, reduction='first'): + """Reduce PerReplica objects. Arguments: - target: A target tensor for the model. It may be `None` if the - output is excluded from loss computation. It is still kept as None - since each output of the model should have a corresponding target. If - the target is None, the rest of the attributes will be None as well. - feedable: Boolean, whether the target is feedable (requires data to be - passed in `fit` or `train_on_batch`), or not (model compiled with - `target_tensors` argument). - skip_target_weights: Boolean, whether the target should be skipped during - weights calculation. - """ - - def __init__(self, target, feedable=False, skip_target_weights=True): - self._target = target - self._feedable = feedable - self._skip_target_weights = skip_target_weights - - @property - def target(self): - return self._target - - @property - def feedable(self): - return self._feedable - - @property - def skip_target_weights(self): - return self._skip_target_weights - - -def _is_symbolic_tensor(x): - return tensor_util.is_tensor(x) and not isinstance(x, ops.EagerTensor) - - -def _convert_scipy_sparse_tensor(value, expected_input): - """Handle scipy sparse tensor conversions. - - This method takes a value 'value' and returns the proper conversion. If - value is a scipy sparse tensor and the expected input is a dense tensor, - we densify 'value'. If value is a scipy sparse tensor and the expected input - is a TF SparseTensor, we convert 'value' to a SparseTensor. If 'value' is - not a scipy sparse tensor, or scipy is not imported, we pass it through - unchanged. - - Arguments: - value: An object that may be a scipy sparse tensor - expected_input: The expected input placeholder. + values: Structure of `PerReplica` objects or `Tensor`s. `Tensor`s are + returned as-is. + strategy: `tf.distribute.Strategy` object. + reduction: One of 'first', 'concat'. Returns: - The possibly-converted 'value'. + Structure of `Tensor`s. """ - if issparse is not None and issparse(value): - if ops.is_dense_tensor_like(expected_input): - if ops.executing_eagerly_outside_functions(): - # In TF2 we do not silently densify sparse matrices. 
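# --- Editorial note (illustrative sketch only, not part of this patch) ---
# The `_convert_scipy_sparse_tensor` helper removed in this hunk builds a
# tf.SparseTensor from a SciPy sparse matrix via its COO representation.
# A self-contained equivalent of that conversion:
import numpy as np
import tensorflow as tf
from scipy.sparse import coo_matrix

sp = coo_matrix(np.array([[0., 2.], [3., 0.]]))
indices = np.stack([sp.row, sp.col], axis=1).astype(np.int64)  # shape (nnz, 2)
sparse_value = tf.SparseTensor(indices=indices, values=sp.data, dense_shape=sp.shape)
print(tf.sparse.to_dense(sparse_value))  # [[0. 2.] [3. 0.]]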
- raise ValueError('A SciPy sparse matrix was passed to a model ' - 'that expects dense inputs. Please densify your ' - 'inputs first, such as by calling `x.toarray().') - return value.toarray() + + def _reduce(v): + """Reduce a single `PerReplica` object.""" + if not isinstance(v, ds_values.PerReplica): + return v + elif reduction == 'first': + return strategy.unwrap(v)[0] # pylint: disable=protected-access + elif reduction == 'concat': + return concat(strategy.unwrap(v)) # pylint: disable=protected-access else: - sparse_coo = value.tocoo() - row, col = sparse_coo.row, sparse_coo.col - data, shape = sparse_coo.data, sparse_coo.shape - indices = np.concatenate((np.expand_dims(row, 1), np.expand_dims(col, 1)), - 1) - return sparse_tensor.SparseTensor(indices, data, shape) - else: - return value + raise ValueError('`reduction` must be "first" or "concat".') + + return nest.map_structure(_reduce, values) + + +def concat(tensors, axis=0): + """Concats `tensor`s along `axis`.""" + if isinstance(tensors[0], sparse_tensor.SparseTensor): + return sparse_ops.sparse_concat_v2(axis=axis, sp_inputs=tensors) + if isinstance(tensors[0], ragged_tensor.RaggedTensor): + return ragged_concat_ops.concat(tensors, axis=axis) + return array_ops.concat(tensors, axis=axis) + + +def to_numpy(tensors): + """Converts a structure of `Tensor`s to `NumPy` arrays.""" + + def _to_single_numpy(t): + if isinstance(t, ops.Tensor): + return t.numpy() + return t # Don't turn ragged or sparse tensors to NumPy. + + return nest.map_structure(_to_single_numpy, tensors) diff --git a/tensorflow/python/keras/engine/training_arrays.py b/tensorflow/python/keras/engine/training_arrays.py index a9c746d6a52..531e576662b 100644 --- a/tensorflow/python/keras/engine/training_arrays.py +++ b/tensorflow/python/keras/engine/training_arrays.py @@ -226,13 +226,9 @@ def model_iteration(model, epochs=epochs, steps_per_epoch=steps_per_epoch, samples=num_samples_or_steps, - verbose=0, # Handle ProgBarLogger separately in this loop. + count_mode=count_mode, + verbose=verbose, mode=mode) - # TODO(omalleyt): Handle ProgBar as part of Callbacks once hooks are ready. - progbar = training_utils.get_progbar( - model, count_mode, mode != ModeKeys.PREDICT) - progbar.params = callbacks.params - progbar.params['verbose'] = verbose # Find beforehand arrays that need sparse-to-dense conversion. if issparse is not None and not use_steps: @@ -259,7 +255,6 @@ def model_iteration(model, callbacks.model.stop_training = False callbacks._call_begin_hook(mode) - progbar.on_train_begin() initial_epoch = model._maybe_load_initial_epoch_from_ckpt(initial_epoch, mode) @@ -275,7 +270,6 @@ def model_iteration(model, model.reset_metrics() if mode == ModeKeys.TRAIN: callbacks.on_epoch_begin(epoch, epoch_logs) - progbar.on_epoch_begin(epoch, epoch_logs) if use_steps: # Step-wise loop. @@ -290,7 +284,6 @@ def model_iteration(model, while step < target_steps: batch_logs = {'batch': step, 'size': 1} callbacks._call_batch_hook(mode, 'begin', step, batch_logs) - progbar.on_batch_begin(step, batch_logs) # Get outputs. try: @@ -320,9 +313,6 @@ def model_iteration(model, elif step > 0: steps_per_epoch = step aggregator.steps = steps_per_epoch - if mode == ModeKeys.TRAIN: - progbar.params['steps'] = steps_per_epoch - progbar.progbar.target = steps_per_epoch else: # We ran out of batches while the user passed an iterator (legacy). callbacks.model.stop_training = True @@ -350,7 +340,6 @@ def model_iteration(model, # Callbacks batch end. 
batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode) callbacks._call_batch_hook(mode, 'end', step, batch_logs) - progbar.on_batch_end(step, batch_logs) step += 1 if callbacks.model.stop_training: @@ -392,7 +381,6 @@ def model_iteration(model, # Callbacks batch_begin. batch_logs = {'batch': batch_index, 'size': len(batch_ids)} callbacks._call_batch_hook(mode, 'begin', batch_index, batch_logs) - progbar.on_batch_begin(batch_index, batch_logs) # Get outputs. batch_outs = f(ins_batch) @@ -407,7 +395,6 @@ def model_iteration(model, # Callbacks batch end. batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode) callbacks._call_batch_hook(mode, 'end', batch_index, batch_logs) - progbar.on_batch_end(batch_index, batch_logs) if callbacks.model.stop_training: break @@ -452,7 +439,6 @@ def model_iteration(model, if mode == ModeKeys.TRAIN: # Epochs only apply to `fit`. callbacks.on_epoch_end(epoch, epoch_logs) - progbar.on_epoch_end(epoch, epoch_logs) # Reinitialize dataset iterator for the next epoch. if reset_dataset_after_each_epoch and epoch < epochs - 1: diff --git a/tensorflow/python/keras/engine/training_dataset_test.py b/tensorflow/python/keras/engine/training_dataset_test.py index 684c966cdd2..79719012c47 100644 --- a/tensorflow/python/keras/engine/training_dataset_test.py +++ b/tensorflow/python/keras/engine/training_dataset_test.py @@ -107,8 +107,7 @@ class TestTrainingWithDataset(keras_parameterized.TestCase): validation_data=dataset, validation_steps=2) # Test with validation split - with self.assertRaisesRegexp( - ValueError, '`validation_split` argument is not supported when '): + with self.assertRaises(ValueError): model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0, validation_split=0.5, validation_steps=2) @@ -124,19 +123,6 @@ class TestTrainingWithDataset(keras_parameterized.TestCase): verbose=0, sample_weight=sample_weight) - # Test invalid usage - with self.assertRaisesRegexp( - ValueError, 'The `batch_size` argument must not be specified'): - model.fit(dataset, batch_size=10, epochs=1, steps_per_epoch=2, - verbose=0) - - with self.assertRaisesRegexp( - ValueError, 'The `batch_size` argument must not be specified'): - model.predict(dataset, batch_size=10, steps=2, verbose=0) - with self.assertRaisesRegexp( - ValueError, 'The `batch_size` argument must not be specified'): - model.evaluate(dataset, batch_size=10, steps=2, verbose=0) - with self.assertRaisesRegexp( ValueError, '(you should not specify a target)|' '(`y` argument is not supported when using dataset as input.)'): @@ -144,14 +130,11 @@ class TestTrainingWithDataset(keras_parameterized.TestCase): epochs=1, steps_per_epoch=2, verbose=0) # With an infinite dataset, `steps_per_epoch`/`steps` argument is required. 
- with self.assertRaisesRegexp( - ValueError, 'the `steps_per_epoch` argument'): + with self.assertRaises(ValueError): model.fit(dataset, epochs=1, verbose=0) - with self.assertRaisesRegexp(ValueError, - 'the `steps` argument'): + with self.assertRaises(ValueError): model.evaluate(dataset, verbose=0) - with self.assertRaisesRegexp(ValueError, - 'the `steps` argument'): + with self.assertRaises(ValueError): model.predict(dataset, verbose=0) @keras_parameterized.run_with_all_model_types(exclude_models='sequential') @@ -185,14 +168,6 @@ class TestTrainingWithDataset(keras_parameterized.TestCase): model.fit(dataset_tuple, epochs=1, steps_per_epoch=2, verbose=1) model.evaluate(dataset_tuple, steps=2, verbose=1) - predict_dataset_tuple = dataset_ops.Dataset.from_tensor_slices( - (input_a_np, input_b_np)) - # TODO(b/123360757): Remove below assertion once predict() supports - # muti-input datasets. - with self.assertRaisesRegexp(ValueError, - 'Error when checking model input'): - model.predict(predict_dataset_tuple, steps=1) - # Test with dict input_dict = {'input_1': input_a_np, 'input_2': input_b_np} if testing_utils.get_model_type() == 'subclass': @@ -457,15 +432,7 @@ class TestTrainingWithDataset(keras_parameterized.TestCase): self.assertIn('10/10', lines[-1]) self.assertLen(history.history['loss'], 2) - # The first epoch will invoke batch begin 11 times, since it doesn't know - # the cardinality. The second epoch should just invoke 10 times. - if (testing_utils.should_run_eagerly() - or testing_utils.should_run_tf_function()): - expected_batch_begin_count = 21 - else: - expected_batch_begin_count = 20 - self.assertEqual(batch_counter.batch_begin_count, - expected_batch_begin_count) + self.assertEqual(batch_counter.batch_begin_count, 21) self.assertEqual(batch_counter.batch_end_count, 20) model.evaluate(dataset) out = model.predict(dataset) diff --git a/tensorflow/python/keras/engine/training_eager_test.py b/tensorflow/python/keras/engine/training_eager_test.py index 8ac94f346c0..d6cd412d1ec 100644 --- a/tensorflow/python/keras/engine/training_eager_test.py +++ b/tensorflow/python/keras/engine/training_eager_test.py @@ -194,12 +194,10 @@ class TrainingTest(keras_parameterized.TestCase): model.fit(dataset, epochs=1, verbose=0) # Step argument is required for infinite datasets. - with self.assertRaisesRegexp(ValueError, - 'specify the `validation_steps` argument.'): + with self.assertRaises(ValueError): model.fit(dataset, steps_per_epoch=2, epochs=1, verbose=0, validation_data=validation_dataset) - with self.assertRaisesRegexp(ValueError, - 'specify the `validation_steps` argument.'): + with self.assertRaises(ValueError): model.fit(dataset, steps_per_epoch=2, epochs=1, verbose=0, validation_data=validation_dataset) @@ -355,7 +353,8 @@ class CorrectnessTest(keras_parameterized.TestCase): x = np.ones((20, 4)).astype(np.float32) y = np.random.randint(0, 3, size=(20,)).astype(np.int64) dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(2) - evaluation_results = dict(zip(model.metrics_names, model.evaluate(dataset))) + results = model.evaluate(dataset) + evaluation_results = dict(zip(model.metrics_names, results)) # Rate of dropout depends on the learning phase. 
self.assertEqual(evaluation_results['regularization_loss'], expected_validation_loss) diff --git a/tensorflow/python/keras/engine/training_generator.py b/tensorflow/python/keras/engine/training_generator.py index d19b2907aa4..1fcf3ef25e4 100644 --- a/tensorflow/python/keras/engine/training_generator.py +++ b/tensorflow/python/keras/engine/training_generator.py @@ -174,12 +174,9 @@ def model_iteration(model, steps_per_epoch=steps_per_epoch, batch_size=batch_size, samples=num_samples_or_steps, - verbose=0, # Handle ProgBar as part of Callbacks once hooks are ready. + count_mode=count_mode, + verbose=verbose, mode=mode) - # TODO(omalleyt): Handle ProgBar as part of Callbacks once hooks are ready. - progbar = training_utils.get_progbar(model, count_mode) - progbar.params = callbacks.params - progbar.params['verbose'] = verbose if mode == ModeKeys.PREDICT: aggregator = training_utils.OutputsAggregator(True, steps=steps_per_epoch) @@ -194,7 +191,6 @@ def model_iteration(model, callbacks.model.stop_training = False callbacks._call_begin_hook(mode) - progbar.on_train_begin() initial_epoch = model._maybe_load_initial_epoch_from_ckpt(initial_epoch, mode) @@ -207,7 +203,6 @@ def model_iteration(model, epoch_logs = {} if mode == ModeKeys.TRAIN: callbacks.on_epoch_begin(epoch, epoch_logs) - progbar.on_epoch_begin(epoch, epoch_logs) if steps_per_epoch is None: # Loop over dataset until `OutOfRangeError` is raised. @@ -237,9 +232,6 @@ def model_iteration(model, elif step > 0: steps_per_epoch = step aggregator.steps = steps_per_epoch - if mode == ModeKeys.TRAIN: - progbar.params['steps'] = steps_per_epoch - progbar.progbar.target = steps_per_epoch else: # We ran out of batches while the user passed an iterator (legacy). callbacks.model.stop_training = True @@ -259,7 +251,6 @@ def model_iteration(model, # Callbacks batch begin. batch_logs = {'batch': step, 'size': batch_size} callbacks._call_batch_hook(mode, 'begin', step, batch_logs) - progbar.on_batch_begin(step, batch_logs) is_deferred = not model._is_compiled batch_outs = batch_function(*batch_data) @@ -283,16 +274,12 @@ def model_iteration(model, verbose=verbose, mode=mode) - progbar.params = callbacks.params - progbar.params['verbose'] = verbose - # Aggregate results. aggregator.aggregate(batch_outs) # Callbacks batch end. batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode) callbacks._call_batch_hook(mode, 'end', step, batch_logs) - progbar.on_batch_end(step, batch_logs) step += 1 if callbacks.model.stop_training: @@ -330,7 +317,6 @@ def model_iteration(model, if mode == ModeKeys.TRAIN: # Epochs only apply to `fit`. callbacks.on_epoch_end(epoch, epoch_logs) - progbar.on_epoch_end(epoch, epoch_logs) # Recreate dataset iterator for the next epoch. 
if reset_dataset_after_each_epoch and epoch < epochs - 1: diff --git a/tensorflow/python/keras/engine/training_generator_test.py b/tensorflow/python/keras/engine/training_generator_test.py index 30e59114e75..c9642fd7c7f 100644 --- a/tensorflow/python/keras/engine/training_generator_test.py +++ b/tensorflow/python/keras/engine/training_generator_test.py @@ -245,15 +245,14 @@ class TestGeneratorMethods(keras_parameterized.TestCase): run_eagerly=testing_utils.should_run_eagerly(), experimental_run_tf_function=testing_utils.should_run_tf_function()) - err_msg = 'Output of generator should be a tuple of 1 or 2 or 3 elements' - with self.assertRaisesRegex(ValueError, err_msg): + with self.assertRaises(ValueError): model.fit_generator(invalid_generator(), steps_per_epoch=5, epochs=1, verbose=1, max_queue_size=10, use_multiprocessing=False) - with self.assertRaisesRegex(ValueError, err_msg): + with self.assertRaises(ValueError): model.fit_generator(custom_generator(), steps_per_epoch=5, epochs=1, @@ -262,12 +261,12 @@ class TestGeneratorMethods(keras_parameterized.TestCase): use_multiprocessing=False, validation_data=invalid_generator(), validation_steps=10) - with self.assertRaisesRegex(ValueError, err_msg): + with self.assertRaises(ValueError): model.predict_generator(invalid_generator(), steps=5, max_queue_size=10, use_multiprocessing=False) - with self.assertRaisesRegex(ValueError, err_msg): + with self.assertRaises(ValueError): model.evaluate_generator(invalid_generator(), steps=5, max_queue_size=10, @@ -330,38 +329,11 @@ class TestGeneratorMethods(keras_parameterized.TestCase): model.evaluate(custom_generator_changing_batch_size(), steps=5) model.predict(custom_generator_changing_batch_size(), steps=5) - @keras_parameterized.run_with_all_model_types - @keras_parameterized.run_all_keras_modes - def test_invalid_batch_size_argument(self): - - def ones_generator(): - while True: - yield np.ones([10, 10], np.float32), np.ones([10, 1], np.float32) - - model = testing_utils.get_small_mlp( - num_hidden=10, num_classes=1, input_dim=10) - - model.compile( - 'adam', - 'binary_crossentropy', - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) - - with self.assertRaisesRegexp( - ValueError, 'The `batch_size` argument must not be specified'): - model.fit(ones_generator(), batch_size=2, epochs=2) - with self.assertRaisesRegexp( - ValueError, 'The `batch_size` argument must not be specified'): - model.evaluate(ones_generator(), batch_size=2) - - with self.assertRaisesRegexp( - ValueError, 'The `batch_size` argument must not be specified'): - model.predict(ones_generator(), batch_size=2) - @keras_parameterized.run_with_all_model_types @keras_parameterized.run_all_keras_modes @data_utils.dont_use_multiprocessing_pool def test_generator_dynamic_shapes(self): + x = [ 'I think juice is great', 'unknown is the best language since slicedbread', diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py index ac2f3972ad8..6ee8971d567 100644 --- a/tensorflow/python/keras/engine/training_test.py +++ b/tensorflow/python/keras/engine/training_test.py @@ -20,8 +20,6 @@ from __future__ import print_function import collections import io -import logging -import re import sys from absl.testing import parameterized @@ -29,16 +27,13 @@ import numpy as np import six from tensorflow.python import keras -from tensorflow.python import tf2 from tensorflow.python.data.ops import dataset_ops from 
tensorflow.python.eager import context -from tensorflow.python.eager import def_function from tensorflow.python.eager import function from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import test_util as tf_test_util from tensorflow.python.keras import keras_parameterized -from tensorflow.python.keras import losses from tensorflow.python.keras import metrics as metrics_module from tensorflow.python.keras import testing_utils from tensorflow.python.keras.callbacks import Callback @@ -53,7 +48,6 @@ from tensorflow.python.ops import sparse_ops from tensorflow.python.ops import state_ops from tensorflow.python.ops import variables as variables_lib from tensorflow.python.platform import test -from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training.rmsprop import RMSPropOptimizer try: @@ -62,206 +56,6 @@ except ImportError: scipy_sparse = None -class CompileTest(keras_parameterized.TestCase): - - def _get_multi_output_model(self): - input_a = keras.layers.Input(shape=(3,), name='input_a') - output_a = keras.layers.Dense(1, name='dense_1')(input_a) - output_b = keras.layers.Dense(1, name='dense_2')(input_a) - return keras.models.Model(input_a, [output_a, output_b]) - - def _do_test_compile_with_model_and_single_loss(self, model, loss): - model.compile( - optimizer='adam', - loss=loss, - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) - self.assertEqual(model.loss, loss) - - loss = losses.get(loss) - if not isinstance(loss, list): - loss_list = [loss] * len(model.outputs) - - self.assertEqual(len(model.loss_functions), len(loss_list)) - for i in range(len(loss_list)): - self.assertIsInstance(model.loss_functions[i], losses.LossFunctionWrapper) - if not isinstance(loss_list[i], losses.LossFunctionWrapper): - self.assertEqual(model.loss_functions[i].fn, loss_list[i]) - self.assertAllEqual(model._loss_weights_list, [1.] * len(loss_list)) - - def test_respect_run_functions_eagerly(self): - with context.eager_mode(): - model = testing_utils.get_small_sequential_mlp( - num_hidden=10, num_classes=2, input_dim=3) - model.compile('sgd', 'mse') - def_function.run_functions_eagerly(True) - self.assertTrue(model.run_eagerly) - def_function.run_functions_eagerly(False) - self.assertFalse(model.run_eagerly) - - @keras_parameterized.run_all_keras_modes - @parameterized.named_parameters(('loss_string', 'mse'), - ('loss_function', losses.mean_squared_error), - ('loss_instance', losses.MeanSquaredError())) - def test_compile_with_single_output(self, loss): - model = testing_utils.get_small_sequential_mlp( - num_hidden=10, num_classes=2, input_dim=3) - self._do_test_compile_with_model_and_single_loss(model, loss) - - @keras_parameterized.run_all_keras_modes - @parameterized.named_parameters(('loss_string', 'mse'), - ('loss_function', losses.mean_squared_error), - ('loss_instance', losses.MeanSquaredError())) - def test_compile_with_multi_output(self, loss): - model = self._get_multi_output_model() - self._do_test_compile_with_model_and_single_loss(model, loss) - - @keras_parameterized.run_all_keras_modes - def test_compile_with_multi_output_and_multi_loss(self): - model = self._get_multi_output_model() - # Test loss is a list. 
- loss = ['mse', 'mae'] - model.compile( - optimizer='adam', - loss=loss, - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) - self.assertEqual(model.loss_functions[0].fn, losses.mean_squared_error) - self.assertEqual(model.loss_functions[1].fn, losses.mean_absolute_error) - self.assertAllEqual(model._loss_weights_list, [1., 1.]) - - # Test loss is a dict. - loss = {'dense_1': 'mae', 'dense_2': 'mse'} - model.compile( - optimizer='adam', - loss=loss, - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) - self.assertEqual(model.loss_functions[0].fn, losses.mean_absolute_error) - self.assertEqual(model.loss_functions[1].fn, losses.mean_squared_error) - self.assertAllEqual(model._loss_weights_list, [1., 1.]) - - @keras_parameterized.run_all_keras_modes - def test_compile_with_multi_output_and_loss_weights_list(self): - model = self._get_multi_output_model() - loss_weights = [1., 2.] - model.compile( - optimizer='adam', - loss='mse', - loss_weights=loss_weights, - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) - self.assertAllEqual(model._loss_weights_list, [1., 2.]) - - def test_compile_with_multi_output_and_loss_weights_dict(self): - with ops.get_default_graph().as_default(): - model = self._get_multi_output_model() - loss_weights = {'dense_1': 1., 'dense_2': 2.} - model.compile(optimizer='adam', loss='mse', loss_weights=loss_weights) - self.assertAllEqual(model._loss_weights_list, [1., 2.]) - - input_np = np.random.random((10, 3)) - output_a_np = np.random.random((10, 1)) - output_b_np = np.random.random((10, 1)) - - with self.cached_session() as sess: - sess.run(variables_lib.global_variables_initializer()) - total_loss, y_preds = sess.run( - [model.total_loss, model.outputs], - feed_dict={ - 'input_a:0': input_np, - 'dense_1_target:0': output_a_np, - 'dense_2_target:0': output_b_np - }) - self.assertAllClose( - total_loss, - np.mean( - np.add((output_a_np - y_preds[0])**2, - 2 * (output_b_np - y_preds[1])**2))) - - @keras_parameterized.run_all_keras_modes - def test_compile_with_incorrect_loss_size(self): - model = testing_utils.get_small_sequential_mlp( - num_hidden=10, num_classes=2, input_dim=3) - with self.assertRaisesRegexp(ValueError, 'The model has 1 outputs'): - model.compile( - optimizer='adam', - loss=['mse', 'mae'], - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) - - @keras_parameterized.run_all_keras_modes - def test_compile_with_incorrect_loss_key(self): - model = testing_utils.get_small_sequential_mlp( - num_hidden=10, num_classes=2, input_dim=3) - with self.assertRaisesRegexp( - ValueError, - r'Unknown entries in loss dictionary: \[\'unknown_output\'\]. 
' - r'Only expected following keys: \[\'dense_1\'\]'): - model.compile( - optimizer='adam', - loss={'unknown_output': 'mse'}, - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) - - @keras_parameterized.run_all_keras_modes - def test_compile_with_incorrect_loss_weights_size(self): - model = testing_utils.get_small_sequential_mlp( - num_hidden=10, num_classes=2, input_dim=3) - with self.assertRaisesRegexp(ValueError, - 'it should have one entry per model output'): - model.compile( - optimizer='adam', - loss='mse', - loss_weights=[1., 2.], - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) - - @keras_parameterized.run_all_keras_modes - def test_compile_with_incorrect_loss_weights_key(self): - model = testing_utils.get_small_sequential_mlp( - num_hidden=10, num_classes=2, input_dim=3) - with self.assertRaisesRegexp( - ValueError, - r'Unknown entries in loss_weights dictionary: \[\'unknown_output\'\]. ' - r'Only expected following keys: \[\'dense_1\'\]'): - model.compile( - optimizer='adam', - loss='mse', - loss_weights={'unknown_output': 1.}, - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) - - @keras_parameterized.run_all_keras_modes - def test_compile_with_incorrect_sample_weight_mode(self): - model = testing_utils.get_small_sequential_mlp( - num_hidden=10, num_classes=2, input_dim=3) - with self.assertRaisesRegexp( - ValueError, - r'Unknown entries in sample_weight_mode dictionary: \[\'unknown\'\]. ' - r'Only expected following keys: \[\'dense_1\'\]'): - model.compile( - optimizer='adam', - loss='mse', - sample_weight_mode={'unknown': 'temporal'}, - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) - - def test_compile_with_session_kwargs(self): - with ops.Graph().as_default(): - model = testing_utils.get_small_sequential_mlp( - num_hidden=10, num_classes=2, input_dim=3) - - # Test that unknown arguments are not accepted - with self.assertRaisesRegexp( - TypeError, - r'Invalid keyword argument'): - model.compile( - optimizer='adam', - loss='mse', - foo=True) - - class TrainingTest(keras_parameterized.TestCase): @keras_parameterized.run_with_all_model_types @@ -356,7 +150,7 @@ class TrainingTest(keras_parameterized.TestCase): @keras_parameterized.run_with_all_model_types def test_target_dtype_matches_output(self): - def _loss_fn(labels, preds): + def loss_fn(labels, preds): self.assertEqual(labels.dtype, preds.dtype) return labels - preds @@ -367,7 +161,7 @@ class TrainingTest(keras_parameterized.TestCase): targets = np.ones(10, dtype=np.float64) model.compile( 'sgd', - loss=_loss_fn, + loss=loss_fn, run_eagerly=testing_utils.should_run_eagerly(), experimental_run_tf_function=testing_utils.should_run_tf_function()) model.train_on_batch(inputs, targets) @@ -584,31 +378,6 @@ class TrainingTest(keras_parameterized.TestCase): batch_size=5, verbose=0) - # Invalid use cases - with self.assertRaises(ValueError): - model.train_on_batch({'input_a': input_a_np}, - [output_d_np, output_e_np]) - with self.assertRaises(ValueError): - model.fit( - [input_a_np, input_b_np], [output_d_np, output_e_np], - epochs=1, - validation_data=([input_a_np, input_b_np], 0, 0), - verbose=0) - with self.assertRaises(ValueError): - model.train_on_batch([input_a_np], [output_d_np, output_e_np]) - with self.assertRaises(ValueError): - 
model.train_on_batch(1, [output_d_np, output_e_np]) - with self.assertRaises(ValueError): - model.train_on_batch(input_a_np, [output_d_np, output_e_np]) - with self.assertRaises(ValueError): - bad_input = np.random.random((11, 3)) - model.train_on_batch([bad_input, input_b_np], - [output_d_np, output_e_np]) - with self.assertRaises(ValueError): - bad_target = np.random.random((11, 4)) - model.train_on_batch([input_a_np, input_b_np], - [bad_target, output_e_np]) - # Build single-input model x = keras.layers.Input(shape=(3,), name='input_a') y = keras.layers.Dense(4)(x) @@ -620,10 +389,6 @@ class TrainingTest(keras_parameterized.TestCase): experimental_run_tf_function=testing_utils.should_run_tf_function()) # This will work model.fit([input_a_np], output_d_np, epochs=1) - # TODO(gsundeep) Test only works in eager, file ticket - if testing_utils.should_run_eagerly() and context.executing_eagerly(): - with self.assertRaises(ValueError): - model.fit([input_a_np, input_a_np], output_d_np, epochs=1) # Test model on a list of floats input_a_np = np.random.random((10, 3)) @@ -841,22 +606,6 @@ class TrainingTest(keras_parameterized.TestCase): model.evaluate(xy_function(use_namedtuple=False), **evaluate_kwargs) model.predict(x_function(use_namedtuple=False), **predict_kwargs) - xy_pattern = re.escape( - "Received namedtuple () with fields " - "`('x', 'y')` as input.") - x_pattern = re.escape( - "Received namedtuple () with fields " - "`('x',)` as input.") - - with self.assertRaisesRegex(ValueError, xy_pattern): - model.fit(xy_function(use_namedtuple=True), **fit_kwargs) - - with self.assertRaisesRegex(ValueError, xy_pattern): - model.evaluate(xy_function(use_namedtuple=True), **evaluate_kwargs) - - with self.assertRaisesRegex(ValueError, x_pattern): - model.predict(x_function(use_namedtuple=True), **predict_kwargs) - @keras_parameterized.run_all_keras_modes def test_custom_mapping_in_config(self): @@ -872,41 +621,6 @@ class TrainingTest(keras_parameterized.TestCase): model = MyModel() self.assertIn('{"a": {}}', model.to_json()) - @keras_parameterized.run_all_keras_modes(always_skip_v1=True) - def test_training_on_sparse_data_with_dense_placeholders(self): - if scipy_sparse is None: - return - - test_inputs = [ - scipy_sparse.random(6, 3, density=0.25).tocsr() for _ in range(2) - ] - test_outputs = [ - scipy_sparse.random(6, i, density=0.25).tocsr() for i in range(3, 5) - ] - in1 = keras.layers.Input(shape=(3,)) - in2 = keras.layers.Input(shape=(3,)) - out1 = keras.layers.Dropout(0.5, name='dropout')(in1) - out2 = keras.layers.Dense(4, name='dense_1')(in2) - model = keras.Model([in1, in2], [out1, out2]) - model.experimental_run_tf_function = testing_utils.should_run_tf_function() - - with self.assertRaisesRegexp(ValueError, 'Please densify'): - model.predict(test_inputs, batch_size=2) - optimizer = 'rmsprop' - model.compile( - optimizer, - 'mse', - metrics=['mae', metrics_module.CategoricalAccuracy()], - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) - - with self.assertRaisesRegexp(ValueError, 'Please densify'): - model.fit(test_inputs, test_outputs, - epochs=1, batch_size=2) - - with self.assertRaisesRegexp(ValueError, 'Please densify'): - model.evaluate(test_inputs, test_outputs, batch_size=2) - def test_training_on_sparse_data_with_dense_placeholders_v1(self): with ops.Graph().as_default(): if scipy_sparse is None: @@ -1087,66 +801,61 @@ class TrainingTest(keras_parameterized.TestCase): 
self.assertEqual(l.non_trainable_variables, [l.layer1.non_trainable_var]) self.assertLen(l.get_weights(), 2) + @keras_parameterized.run_all_keras_modes(always_skip_v1=True) def test_logs_passed_to_callbacks(self): - with self.cached_session(): - input_dim = 5 - num_classes = 1 + input_dim = 5 + num_classes = 1 - class TestCallback(Callback): + class TestCallback(Callback): - def __init__(self): - super(TestCallback, self).__init__() - self.epoch_end_logs = None - self.batch_end_logs = None - self.epoch_end_call_count = 0 - self.batch_end_call_count = 0 + def __init__(self): + super(TestCallback, self).__init__() + self.epoch_end_logs = None + self.batch_end_logs = None + self.epoch_end_call_count = 0 + self.batch_end_call_count = 0 - def on_epoch_end(self, epoch, logs=None): - self.epoch_end_logs = logs - self.epoch_end_call_count += 1 + def on_epoch_end(self, epoch, logs=None): + self.epoch_end_logs = logs + self.epoch_end_call_count += 1 - def on_batch_end(self, batch, logs=None): - self.batch_end_logs = logs - self.batch_end_call_count += 1 + def on_batch_end(self, batch, logs=None): + self.batch_end_logs = logs + self.batch_end_call_count += 1 - model = testing_utils.get_small_sequential_mlp( - num_hidden=10, num_classes=num_classes, input_dim=input_dim) - model.compile( - loss='binary_crossentropy', - metrics=['acc'], - weighted_metrics=['mae'], - optimizer=RMSPropOptimizer(learning_rate=0.01)) + model = testing_utils.get_small_sequential_mlp( + num_hidden=10, num_classes=num_classes, input_dim=input_dim) + model.compile( + loss='binary_crossentropy', + metrics=['acc'], + weighted_metrics=['mae'], + optimizer=RMSPropOptimizer(learning_rate=0.01), + run_eagerly=testing_utils.should_run_eagerly()) - np.random.seed(1337) - (x_train, y_train), (_, _) = testing_utils.get_test_data( - train_samples=10, - test_samples=10, - input_shape=(input_dim,), - num_classes=num_classes) + np.random.seed(1337) + (x_train, y_train), (_, _) = testing_utils.get_test_data( + train_samples=10, + test_samples=10, + input_shape=(input_dim,), + num_classes=num_classes) - test_callback = TestCallback() - model.fit( - x_train, - y_train, - batch_size=2, - epochs=2, - verbose=0, - callbacks=[test_callback], - validation_data=(x_train, y_train)) - self.assertEqual(test_callback.batch_end_call_count, 10) - self.assertEqual(test_callback.epoch_end_call_count, 2) + test_callback = TestCallback() + model.fit( + x_train, + y_train, + batch_size=2, + epochs=2, + verbose=0, + callbacks=[test_callback], + validation_data=(x_train, y_train)) + self.assertEqual(test_callback.batch_end_call_count, 10) + self.assertEqual(test_callback.epoch_end_call_count, 2) - weighted_metric = ('mae' - if tf2.enabled() else 'weighted_mean_absolute_error') - self.assertSetEqual( - set(test_callback.batch_end_logs.keys()), - set(['batch', 'size', 'acc', 'loss', weighted_metric])) - self.assertSetEqual( - set(test_callback.epoch_end_logs.keys()), - set([ - 'acc', 'loss', weighted_metric, 'val_acc', 'val_loss', - 'val_' + weighted_metric - ])) + self.assertSetEqual( + set(test_callback.batch_end_logs.keys()), set(['acc', 'loss', 'mae'])) + self.assertSetEqual( + set(test_callback.epoch_end_logs.keys()), + set(['acc', 'loss', 'mae', 'val_acc', 'val_loss', 'val_mae'])) @keras_parameterized.run_all_keras_modes def test_mismatched_output_shape_and_target_shape(self): @@ -1160,8 +869,8 @@ class TrainingTest(keras_parameterized.TestCase): run_eagerly=testing_utils.should_run_eagerly(), 
experimental_run_tf_function=testing_utils.should_run_tf_function()) # Test with Numpy data - x_train = np.random.random((10, 3, 4)) - y_train = np.random.randint(0, 5, size=(10, 3)) + x_train = np.random.random((10, 3, 4)).astype(np.float32) + y_train = np.random.randint(0, 5, size=(10, 3)).astype(np.float32) model.fit(x_train, y_train, batch_size=5, epochs=1) # Test with iterator @@ -1238,6 +947,8 @@ class TrainingTest(keras_parameterized.TestCase): @tf_test_util.run_in_graph_and_eager_modes def test_static_batch_in_input_layer(self): + if context.executing_eagerly(): + self.skipTest('Not inferred in eager.') class Counter(keras.callbacks.Callback): @@ -1268,6 +979,8 @@ class TrainingTest(keras_parameterized.TestCase): @tf_test_util.run_in_graph_and_eager_modes def test_static_batch_in_input_layer_consistency_checks(self): + if context.executing_eagerly(): + self.skipTest('Not inferred in eager.') x, y = np.ones((64, 10), 'float32'), np.ones((64, 1), 'float32') inputs = keras.Input(batch_size=2, shape=(10,)) @@ -1408,6 +1121,8 @@ class TrainingTest(keras_parameterized.TestCase): @keras_parameterized.run_with_all_model_types @keras_parameterized.run_all_keras_modes def test_validation_steps_without_data(self): + if context.executing_eagerly(): + self.skipTest('Check removed in new `fit`') x, y = np.ones((10, 10)), np.ones((10, 1)) model = testing_utils.get_small_mlp(2, 1, 10) model.compile( @@ -1484,9 +1199,6 @@ class TrainingTest(keras_parameterized.TestCase): dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(2) model.fit(dataset) self.assertEqual(model._compute_dtype, 'float32') - # Input dtype should match the model dtype, even if the inputs passed to the - # model have a different dtype. - self.assertEqual(model.inputs[0].dtype, 'float32') @keras_parameterized.run_all_keras_modes(always_skip_v1=True) def test_subclassed_model_with_training_arg(self): @@ -1546,62 +1258,6 @@ class TrainingTest(keras_parameterized.TestCase): class TestExceptionsAndWarnings(keras_parameterized.TestCase): - @keras_parameterized.run_with_all_model_types - @keras_parameterized.run_all_keras_modes - def test_invalid_batch_dimension(self): - - def custom_reshape(inputs): - return keras.backend.reshape(inputs, (-1, 8, 8, 3)) - - layer_1 = keras.layers.Lambda(custom_reshape) - layer_2 = keras.layers.Conv2D(32, (3, 3)) - - model = testing_utils.get_model_from_layers([layer_1, layer_2], - input_shape=(8, 8, 6)) - model.compile('sgd', loss='mse') - - with self.assertRaisesRegex( - ValueError, - 'Mismatch between expected batch size and model output batch size. 
' - r'Output shape = \(20, 6, 6, 32\), expected output shape = ' - r'shape \(10, 6, 6, 32\)'): - model.predict(np.ones((10, 8, 8, 6)), batch_size=10) - - @keras_parameterized.run_all_keras_modes - def test_invalid_loss(self): - num_classes = 5 - train_samples = 1000 - test_samples = 1000 - input_dim = 5 - - model = testing_utils.get_small_sequential_mlp( - num_hidden=10, num_classes=num_classes, input_dim=input_dim) - optimizer = RMSPropOptimizer(learning_rate=0.001) - model.compile(optimizer, loss='categorical_crossentropy') - np.random.seed(1337) - (x_train, y_train), (_, _) = testing_utils.get_test_data( - train_samples=train_samples, - test_samples=test_samples, - input_shape=(input_dim,), - num_classes=num_classes) - - with self.assertRaisesRegexp( - ValueError, - 'Input arrays should have the same number of samples as target arrays'): - model.fit(x_train, np.concatenate([y_train, y_train], axis=-1)) - - with self.assertRaisesRegexp(ValueError, - 'expects targets to be binary matrices'): - model.fit(x_train, y_train) - - with self.assertRaisesRegexp(ValueError, 'no loss to optimize'): - model.compile( - optimizer, - loss=None, - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) - model.fit(x_train) - @keras_parameterized.run_all_keras_modes def test_compile_warning_for_loss_missing_output(self): with self.cached_session(): @@ -1611,98 +1267,17 @@ class TestExceptionsAndWarnings(keras_parameterized.TestCase): model = keras.models.Model(inputs=[inp], outputs=[out_1, out_2]) optimizer = RMSPropOptimizer(learning_rate=0.001) - with test.mock.patch.object(logging, 'warning') as mock_log: - model.compile( - optimizer, - loss={ - 'dense_2': 'categorical_crossentropy', - }, - metrics={ - 'dense_2': 'categorical_accuracy', - 'dense_1': metrics_module.CategoricalAccuracy(), - }, - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) - msg = ('Output dense_1 missing from loss dictionary. We assume this ' - 'was done on purpose. 
The fit and evaluate APIs will not be ' - 'expecting any data to be passed to dense_1.') - self.assertRegexpMatches(str(mock_log.call_args), msg) - - @keras_parameterized.run_all_keras_modes - def test_invalid_steps_per_epoch_usage(self): - x = keras.layers.Input(shape=(1,)) - y = keras.layers.Dense(1)(x) - - model = keras.Model(x, y) - model.compile( - 'sgd', - loss='mse', - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=False) - err_msg = 'When passing input data as arrays, do not specify' - - with test.mock.patch.object(logging, 'warning') as mock_log: - model._standardize_user_data( - np.zeros((100, 1)), np.ones((100, 1)), check_steps=True, steps=4) - self.assertRegexpMatches(str(mock_log.call_args), err_msg) - - @keras_parameterized.run_with_all_model_types - @keras_parameterized.run_all_keras_modes - def test_invalid_batch_size_argument_with_sequence_input(self): - - class DummySequence(data_utils.Sequence): - - def __getitem__(self, idx): - return np.zeros([10, 2]), np.ones([10, 4]) - - def __len__(self): - return 10 - - model = testing_utils.get_small_mlp( - num_hidden=10, num_classes=1, input_dim=10) - - model.compile( - 'adam', - 'binary_crossentropy', - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) - - with self.assertRaisesRegexp( - ValueError, 'The `batch_size` argument must not be specified'): - model.fit(DummySequence(), batch_size=2, epochs=2) - with self.assertRaisesRegexp( - ValueError, 'The `batch_size` argument must not be specified'): - model.evaluate(DummySequence(), batch_size=2) - - with self.assertRaisesRegexp( - ValueError, 'The `batch_size` argument must not be specified'): - model.predict(DummySequence(), batch_size=2) - - @keras_parameterized.run_with_all_model_types - @keras_parameterized.run_all_keras_modes(always_skip_v1=True) - def test_non_returning_sequence(self): - if not testing_utils.should_run_tf_function(): - self.skipTest('This case is only handled in the new execution path.') - - class DummySequence(data_utils.Sequence): - - def __getitem__(self, idx): - return - - def __len__(self): - return 10 - - model = testing_utils.get_small_mlp( - num_hidden=10, num_classes=1, input_dim=10) - - model.compile( - 'adam', - 'binary_crossentropy', - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) - - with self.assertRaisesRegexp(IndexError, 'Could not infer batch size'): - model.fit(DummySequence(), epochs=2) + model.compile( + optimizer, + loss={ + 'dense_2': 'categorical_crossentropy', + }, + metrics={ + 'dense_2': 'categorical_accuracy', + 'dense_1': metrics_module.CategoricalAccuracy(), + }, + run_eagerly=testing_utils.should_run_eagerly(), + experimental_run_tf_function=testing_utils.should_run_tf_function()) @keras_parameterized.run_with_all_model_types @keras_parameterized.run_all_keras_modes @@ -1972,100 +1547,11 @@ class LossWeightingTest(keras_parameterized.TestCase): x = np.random.random((10, 3)) y = np.random.random((10, 2)) - with self.assertRaisesRegexp( - ValueError, - r'Unknown entries in sample_weight dictionary: \[\'unknown\'\]. 
' - r'Only expected following keys: \[\'output_1\', \'output_2\'\]'): - model.fit([x, x], [y, y], - epochs=1, - sample_weight={'unknown': 'something'}) + with self.assertRaises(ValueError): + model.fit([x, x], [y, y], epochs=1, sample_weight={'unknown': x}) - with self.assertRaisesRegexp( - ValueError, - r'Unknown entries in class_weight dictionary: \[\'unknown\'\]. ' - r'Only expected following keys: \[\'output_1\', \'output_2\'\]'): - model.fit([x, x], [y, y], epochs=1, class_weight={'unknown': 'something'}) - - @keras_parameterized.run_all_keras_modes - def test_class_weight_invalid_use_case(self): - num_classes = 5 - train_samples = 1000 - test_samples = 1000 - input_dim = 5 - timesteps = 3 - learning_rate = 0.001 - - with self.cached_session(): - model = keras.models.Sequential() - model.add( - keras.layers.TimeDistributed( - keras.layers.Dense(num_classes), - input_shape=(timesteps, input_dim))) - model.add(keras.layers.Activation('softmax')) - optimizer = RMSPropOptimizer(learning_rate=learning_rate) - model.compile( - optimizer, - loss='binary_crossentropy', - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) - - (x_train, y_train), _ = testing_utils.get_test_data( - train_samples=train_samples, - test_samples=test_samples, - input_shape=(input_dim,), - num_classes=num_classes) - # convert class vectors to binary class matrices - y_train = np_utils.to_categorical(y_train, num_classes) - class_weight = dict([(i, 1.) for i in range(num_classes)]) - - del class_weight[1] - with self.assertRaises(ValueError): - model.fit(x_train, y_train, - epochs=0, verbose=0, class_weight=class_weight) - - with self.assertRaises(ValueError): - model.compile( - optimizer, - loss='binary_crossentropy', - sample_weight_mode=[], - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) - - # Build multi-output model - x = keras.Input((3,)) - y1 = keras.layers.Dense(4, name='1')(x) - y2 = keras.layers.Dense(4, name='2')(x) - model = keras.models.Model(x, [y1, y2]) - model.compile( - optimizer, - loss='mse', - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) - x_np = np.random.random((10, 3)) - y_np = np.random.random((10, 4)) - w_np = np.random.random((10,)) - # This will work - model.fit(x_np, [y_np, y_np], epochs=1, - sample_weight={'1': w_np}) - # These will not - with self.assertRaises(ValueError): - model.fit(x_np, [y_np, y_np], epochs=1, - sample_weight=[w_np]) - with self.assertRaises(TypeError): - model.fit(x_np, [y_np, y_np], epochs=1, - sample_weight=w_np) - with self.assertRaises(ValueError): - bad_w_np = np.random.random((11,)) - model.fit(x_np, [y_np, y_np], epochs=1, - sample_weight={'1': bad_w_np}) - with self.assertRaises(ValueError): - bad_w_np = np.random.random((10, 2)) - model.fit(x_np, [y_np, y_np], epochs=1, - sample_weight={'1': bad_w_np}) - with self.assertRaises(ValueError): - bad_w_np = np.random.random((10, 2, 2)) - model.fit(x_np, [y_np, y_np], epochs=1, - sample_weight={'1': bad_w_np}) + with self.assertRaises(ValueError): + model.fit([x, x], [y, y], epochs=1, class_weight={'unknown': 1}) @keras_parameterized.run_all_keras_modes def test_default_sample_weight(self): @@ -2169,39 +1655,6 @@ class LossWeightingTest(keras_parameterized.TestCase): self.assertAllClose( (2+ .4 + .3 + 1) / 4, sess.run(model.total_loss, feed_dict=feeds)) - def test_prepare_sample_weights(self): - # 
pylint:disable=anomalous-backslash-in-string - input_layer = keras.layers.Input(shape=1, name='input_layer') - model = keras.Model(inputs=input_layer, outputs=[input_layer, input_layer]) - sample_weights = array_ops.constant([0, .4, 1, 1]) - temporal_weights = array_ops.constant([[1, 2], [3, 4], [5, 6]]) - - model.compile( - loss='mean_absolute_error', - optimizer='adam', - sample_weight_mode=None) - - with self.assertRaises(AssertionError): - model._prepare_sample_weights([sample_weights, sample_weights]) - - model.compile(loss='mean_absolute_error', optimizer='adam', - sample_weight_mode='temporal') - model._prepare_sample_weights([temporal_weights, temporal_weights]) - with self.assertRaisesRegexp(ValueError, 'Expected shape \[None, None\]'): - model._prepare_sample_weights([sample_weights, sample_weights]) - - with self.assertRaisesRegexp(ValueError, - 'sample weights must have same length as the ' - 'number of outputs'): - model._prepare_sample_weights([temporal_weights]) - - model.compile(loss='mean_absolute_error', optimizer='adam', - sample_weight_mode='samplewise') - model._prepare_sample_weights([sample_weights, sample_weights]) - with self.assertRaisesRegexp(ValueError, 'Expected shape \[None\]'): - model._prepare_sample_weights([temporal_weights, temporal_weights]) - # pylint:enable=anomalous-backslash-in-string - @keras_parameterized.run_all_keras_modes class MaskingTest(keras_parameterized.TestCase): @@ -2524,100 +1977,90 @@ class TestTrainingWithDataTensors(keras_parameterized.TestCase): validation_data=(inputs, targets), validation_steps=2) def test_training_and_eval_methods_on_symbolic_tensors_multi_io(self): - with ops.Graph().as_default(): - a = keras.layers.Input(shape=(3,), name='input_a') - b = keras.layers.Input(shape=(3,), name='input_b') + a = keras.layers.Input(shape=(3,), name='input_a') + b = keras.layers.Input(shape=(3,), name='input_b') - dense = keras.layers.Dense(4, name='dense') - c = dense(a) - d = dense(b) - e = keras.layers.Dropout(0.5, name='dropout')(c) + dense = keras.layers.Dense(4, name='dense') + c = dense(a) + d = dense(b) + e = keras.layers.Dropout(0.5, name='dropout')(c) - model = keras.models.Model([a, b], [d, e]) + model = keras.models.Model([a, b], [d, e]) - optimizer = 'rmsprop' - loss = 'mse' - loss_weights = [1., 0.5] - model.compile( - optimizer, - loss, - metrics=['mae', metrics_module.CategoricalAccuracy()], - loss_weights=loss_weights) + optimizer = 'rmsprop' + loss = 'mse' + loss_weights = [1., 0.5] + model.compile( + optimizer, + loss, + metrics=['mae', metrics_module.CategoricalAccuracy()], + loss_weights=loss_weights) - input_a_tf = keras.backend.zeros(shape=(10, 3)) - input_b_tf = keras.backend.zeros(shape=(10, 3)) + input_a_tf = array_ops.zeros(shape=(10, 3)) + input_b_tf = array_ops.zeros(shape=(10, 3)) - output_d_tf = keras.backend.zeros(shape=(10, 4)) - output_e_tf = keras.backend.zeros(shape=(10, 4)) + output_d_tf = array_ops.zeros(shape=(10, 4)) + output_e_tf = array_ops.zeros(shape=(10, 4)) - model.fit( - [input_a_tf, input_b_tf], [output_d_tf, output_e_tf], - epochs=1, - steps_per_epoch=2, - verbose=0) - with self.assertRaisesRegexp(ValueError, - 'should specify the `steps_per_epoch`'): - model.fit( - [input_a_tf, input_b_tf], [output_d_tf, output_e_tf], - epochs=1, - batch_size=5, - verbose=0) - model.train_on_batch([input_a_tf, input_b_tf], [output_d_tf, output_e_tf]) + model.fit([input_a_tf, input_b_tf], [output_d_tf, output_e_tf], + epochs=1, + steps_per_epoch=2, + verbose=0) + model.train_on_batch([input_a_tf, 
input_b_tf], [output_d_tf, output_e_tf]) - # Test with dictionary inputs - model.fit( - {'input_a': input_a_tf, - 'input_b': input_b_tf}, - {'dense': output_d_tf, - 'dropout': output_e_tf}, - epochs=1, - steps_per_epoch=2, - verbose=0) - model.fit( - {'input_a': input_a_tf, - 'input_b': input_b_tf}, - {'dense': output_d_tf, - 'dropout': output_e_tf}, - validation_data=({'input_a': input_a_tf, - 'input_b': input_b_tf}, - {'dense': output_d_tf, - 'dropout': output_e_tf}), - epochs=1, - steps_per_epoch=2, - validation_steps=2, - verbose=0) - model.train_on_batch( - {'input_a': input_a_tf, - 'input_b': input_b_tf}, - {'dense': output_d_tf, - 'dropout': output_e_tf}) + # Test with dictionary inputs + model.fit({ + 'input_a': input_a_tf, + 'input_b': input_b_tf + }, { + 'dense': output_d_tf, + 'dropout': output_e_tf + }, + epochs=1, + steps_per_epoch=2, + verbose=0) + model.fit({ + 'input_a': input_a_tf, + 'input_b': input_b_tf + }, { + 'dense': output_d_tf, + 'dropout': output_e_tf + }, + validation_data=({ + 'input_a': input_a_tf, + 'input_b': input_b_tf + }, { + 'dense': output_d_tf, + 'dropout': output_e_tf + }), + epochs=1, + steps_per_epoch=2, + validation_steps=2, + verbose=0) + model.train_on_batch({ + 'input_a': input_a_tf, + 'input_b': input_b_tf + }, { + 'dense': output_d_tf, + 'dropout': output_e_tf + }) - # Test with validation data - model.fit( - [input_a_tf, input_b_tf], [output_d_tf, output_e_tf], - validation_data=([input_a_tf, input_b_tf], - [output_d_tf, output_e_tf]), - epochs=1, - steps_per_epoch=2, - validation_steps=2, - verbose=0) - # Test with validation split - with self.assertRaisesRegexp(ValueError, - 'you cannot use `validation_split`'): - model.fit( - [input_a_tf, input_b_tf], [output_d_tf, output_e_tf], - epochs=2, - steps_per_epoch=2, - verbose=0, - validation_split=0.2, - validation_steps=2) - - # Test evaluation / prediction methods - model.evaluate([input_a_tf, input_b_tf], [output_d_tf, output_e_tf], - steps=2, verbose=0) - model.predict([input_a_tf, input_b_tf], steps=2) - model.test_on_batch([input_a_tf, input_b_tf], [output_d_tf, output_e_tf]) + # Test with validation data + model.fit([input_a_tf, input_b_tf], [output_d_tf, output_e_tf], + validation_data=([input_a_tf, + input_b_tf], [output_d_tf, output_e_tf]), + epochs=1, + steps_per_epoch=2, + validation_steps=2, + verbose=0) + # Test evaluation / prediction methods + model.evaluate([input_a_tf, input_b_tf], [output_d_tf, output_e_tf], + steps=2, + verbose=0) + model.predict([input_a_tf, input_b_tf], steps=2) + model.test_on_batch([input_a_tf, input_b_tf], [output_d_tf, output_e_tf]) + @tf_test_util.run_deprecated_v1 def test_model_with_input_feed_tensor(self): """We test building a model with a TF variable as input. 
@@ -2862,31 +2305,6 @@ class TestTrainingWithDataTensors(keras_parameterized.TestCase): out = model.test_on_batch(None, None) out = model.predict_on_batch(None) - # test fit - with self.assertRaises(ValueError): - out = model.fit(None, None, epochs=1, batch_size=10) - out = model.fit(None, None, epochs=1, steps_per_epoch=1) - - # test fit with validation data - with self.assertRaises(ValueError): - out = model.fit(None, None, epochs=1, - steps_per_epoch=None, - validation_steps=2) - out = model.fit(None, None, epochs=1, - steps_per_epoch=2, - validation_steps=2) - - # test evaluate - with self.assertRaises(ValueError): - out = model.evaluate(None, None, batch_size=10) - out = model.evaluate(None, None, steps=3) - - # test predict - with self.assertRaises(ValueError): - out = model.predict(None, batch_size=10) - out = model.predict(None, steps=3) - self.assertEqual(out.shape, (10 * 3, 4)) - # Test multi-output model with no external data at all. self.evaluate(variables_lib.variables_initializer([input_v])) a = keras.Input(tensor=input_v) @@ -2904,19 +2322,6 @@ class TestTrainingWithDataTensors(keras_parameterized.TestCase): out = model.test_on_batch(None, None) out = model.predict_on_batch(None) - # test fit - with self.assertRaises(ValueError): - out = model.fit(None, None, epochs=1, batch_size=10) - out = model.fit(None, None, epochs=1, steps_per_epoch=1) - - # test evaluate - with self.assertRaises(ValueError): - out = model.evaluate(None, None, batch_size=10) - out = model.evaluate(None, None, steps=3) - - # test predict - with self.assertRaises(ValueError): - out = model.predict(None, batch_size=10, verbose=1) out = model.predict(None, steps=3) self.assertEqual(len(out), 2) self.assertEqual(out[0].shape, (10 * 3, 4)) @@ -3074,15 +2479,13 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase): run_eagerly=testing_utils.should_run_eagerly(), experimental_run_tf_function=testing_utils.should_run_tf_function()) - mse_metric = 'mse' if tf2.enabled() else 'mean_squared_error' + mse_metric = 'mse' if context.executing_eagerly() else 'mean_squared_error' reference_metric_names = [ 'loss', 'dense_loss', 'dropout_loss', 'dense_' + mse_metric, 'dense_binary_accuracy', 'dropout_' + mse_metric, 'dropout_binary_accuracy' ] - self.assertEqual(reference_metric_names, model.metrics_names) - # Verify that model metric names are not altered during training. input_a_np = np.random.random((10, 3)) input_b_np = np.random.random((10, 3)) @@ -3181,63 +2584,6 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase): run_eagerly=testing_utils.should_run_eagerly(), experimental_run_tf_function=testing_utils.should_run_tf_function()) - @keras_parameterized.run_all_keras_modes - def test_invalid_metrics(self): - num_classes = 5 - input_dim = 5 - - model = testing_utils.get_small_sequential_mlp( - num_hidden=10, num_classes=num_classes, input_dim=input_dim) - - with self.assertRaisesRegexp( - TypeError, 'Type of `metrics` argument not understood. 
' - 'Expected a list or dictionary, found: '): - model.compile( - RMSPropOptimizer(learning_rate=0.001), - loss='categorical_crossentropy', - metrics=metrics_module.CategoricalAccuracy(), - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) - - inp = keras.layers.Input(shape=(1,)) - x = keras.layers.Dense(3, activation='relu')(inp) - out_1 = keras.layers.Dense(1, activation='sigmoid', name='output_1')(x) - out_2 = keras.layers.Dense(1, activation='sigmoid', name='output_2')(x) - model = keras.models.Model(inp, [out_1, out_2]) - with self.assertRaisesRegex( - ValueError, 'When passing a list of lists as `metrics`, ' - 'it should have one entry per model output. ' - 'The model has 2 outputs, but you passed metrics='): - model.compile('rmsprop', loss='mse', metrics=[['mse']]) - - with self.assertRaisesRegex( - ValueError, - r'Unknown entries in metrics dictionary: \[\'output_3\'\]. Only ' - r'expected following keys: \[\'output_1\', \'output_2\'\]'): - model.compile( - optimizer='rmsprop', - loss='mse', - metrics={ - 'output_1': 'mse', - 'output_3': 'mse', - }, - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) - - with self.assertRaisesRegex( - ValueError, - r'Unknown entries in metrics dictionary: \[\'output_3\'\]. Only ' - r'expected following keys: \[\'output_1\', \'output_2\'\]'): - model.compile( - optimizer='rmsprop', - loss='mse', - weighted_metrics={ - 'output_1': 'mse', - 'output_3': 'mse', - }, - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) - @keras_parameterized.run_all_keras_modes def test_metrics_masking(self): np.random.seed(1337) @@ -3382,7 +2728,7 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase): self.assertEqual(history.history['metric_1'][-1], 5) self.assertAlmostEqual(history.history['val_metric_1'][-1], 5, 0) - @keras_parameterized.run_all_keras_modes + @keras_parameterized.run_all_keras_modes(always_skip_v1=True) def test_model_metrics_list(self): class LayerWithAddMetric(keras.layers.Layer): @@ -3435,13 +2781,14 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase): run_eagerly=testing_utils.should_run_eagerly(), experimental_run_tf_function=testing_utils.should_run_tf_function()) + model.fit(np.ones((10, 1)), np.ones((10, 1)), batch_size=10) + # Verify that the metrics added using `compile` and `add_metric` API are # included - self.assertEqual([m.name for m in model._compile_metrics], ['metric_4']) self.assertEqual([m.name for m in model.metrics], - ['metric_4', 'metric_2', 'metric_1', 'metric_3']) + ['loss', 'metric_4', 'metric_2', 'metric_1', 'metric_3']) - @keras_parameterized.run_all_keras_modes + @keras_parameterized.run_all_keras_modes(always_skip_v1=True) def test_model_metrics_list_in_call(self): class TestModel(keras.Model): @@ -3466,8 +2813,8 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase): y = np.ones(shape=(10, 2)) model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y)) - self.assertEqual([m.name for m in model._compile_metrics], ['acc']) - self.assertEqual([m.name for m in model.metrics], ['acc', 'metric_1']) + self.assertEqual([m.name for m in model.metrics], + ['loss', 'acc', 'metric_1']) @keras_parameterized.run_all_keras_modes def test_multiple_add_metric_calls(self): @@ -3508,36 +2855,6 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase): model.train_on_batch(x, y) 
model.test_on_batch(x, y) - @keras_parameterized.run_with_all_model_types - @keras_parameterized.run_all_keras_modes - def test_invalid_metric_tensor(self): - - class TestLayer(keras.layers.Layer): - - def build(self, input_shape): - self.built = True - - def call(self, inputs): - self.add_metric(math_ops.reduce_mean(inputs), name='metric_1') - return inputs + 1 - - layers = [TestLayer(input_shape=(1,))] - layers.append(keras.layers.Dense(2, kernel_initializer='ones')) - x = np.ones(shape=(10, 1)) - y = np.ones(shape=(10, 2)) - - with self.assertRaisesRegexp( - ValueError, - 'We do not support adding an aggregated metric result tensor that is ' - 'not the output of a `tf.keras.metrics.Metric` metric instance.'): - model = testing_utils.get_model_from_layers(layers, input_shape=(1,)) - model.compile( - loss='mse', - optimizer=RMSPropOptimizer(0.01), - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) - model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y)) - @keras_parameterized.run_all_keras_modes def test_duplicate_metric_name_in_add_metric(self): @@ -3677,7 +2994,7 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase): 'one': [1.0, 1.0, 1.0] }) - @keras_parameterized.run_all_keras_modes + @keras_parameterized.run_all_keras_modes(always_skip_v1=True) def test_model_with_nested_compiled_model(self): class LayerWithAddMetric(keras.layers.Layer): @@ -3705,9 +3022,10 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase): metrics=[metrics_module.Accuracy('acc')], run_eagerly=testing_utils.should_run_eagerly(), experimental_run_tf_function=testing_utils.should_run_tf_function()) + inner_model.fit(np.ones((10, 1)), np.ones((10, 1)), batch_size=10) self.assertEqual([m.name for m in inner_model.metrics], - ['acc', 'mean', 'mean1']) + ['loss', 'acc', 'mean', 'mean1']) x = keras.layers.Input(shape=[1]) y = inner_model(x) @@ -3721,8 +3039,9 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase): metrics=[metrics_module.Accuracy('acc2')], run_eagerly=testing_utils.should_run_eagerly(), experimental_run_tf_function=testing_utils.should_run_tf_function()) + outer_model.fit(np.ones((10, 1)), np.ones((10, 1)), batch_size=10) self.assertEqual([m.name for m in outer_model.metrics], - ['acc2', 'mean', 'mean1', 'mean2']) + ['loss', 'acc2', 'mean', 'mean1', 'mean2']) class BareUpdateLayer(keras.layers.Layer): diff --git a/tensorflow/python/keras/engine/training_v1.py b/tensorflow/python/keras/engine/training_v1.py index 67840a505e9..9261ab30889 100644 --- a/tensorflow/python/keras/engine/training_v1.py +++ b/tensorflow/python/keras/engine/training_v1.py @@ -49,8 +49,6 @@ from tensorflow.python.keras.engine import training_distributed from tensorflow.python.keras.engine import training_eager from tensorflow.python.keras.engine import training_generator from tensorflow.python.keras.engine import training_utils -from tensorflow.python.keras.engine import training_v2 -from tensorflow.python.keras.engine import training_v2_utils from tensorflow.python.keras.mixed_precision.experimental import loss_scale_optimizer from tensorflow.python.keras.optimizer_v2 import optimizer_v2 from tensorflow.python.keras.saving.saved_model import model_serialization @@ -162,6 +160,8 @@ class Model(training_lib.Model): self._experimental_run_tf_function = ( ops.executing_eagerly_outside_functions()) + self._v1_compile_was_called = False + @trackable.no_automatic_dependency_tracking def _set_strategy(self, strategy): 
self._compile_time_distribution_strategy = strategy @@ -301,6 +301,7 @@ class Model(training_lib.Model): self._run_eagerly = kwargs.pop('run_eagerly', None) self._experimental_run_tf_function = kwargs.pop( 'experimental_run_tf_function', True) + self._v1_compile_was_called = True # Prepare Session arguments (legacy). kwargs.pop('cloning', None) # Legacy DistStrat argument, never used. @@ -561,14 +562,6 @@ class Model(training_lib.Model): 'original `Dataset` object instead of passing in ' '`iter(dataset)`.') - # Experiment training loop with default DS path. - if context.executing_eagerly() and self._experimental_run_tf_function: - if self._in_multi_worker_mode(): - return training_distributed.DistributionMultiWorkerTrainingLoop( - training_v2.Loop()) - else: - return training_v2.Loop() - # Case 1: distribution strategy. if self._distribution_strategy: if self._in_multi_worker_mode(): @@ -1031,18 +1024,6 @@ class Model(training_lib.Model): """ self._assert_compile_was_called() self._check_call_args('train_on_batch') - if self._experimental_run_tf_function: - outputs = training_v2_utils.train_on_batch( - self, x, y=y, sample_weight=sample_weight, - class_weight=class_weight, reset_metrics=reset_metrics, - standalone=True) - outputs = (outputs['total_loss'] + outputs['output_losses'] + - outputs['metrics']) - outputs = [ - training_v2_utils._non_none_constant_value(v) for v in outputs] # pylint: disable=protected-access - if len(outputs) == 1: - outputs = outputs[0] - return outputs # If at this point we are in the replica context, then it is okay to execute # the Eager code path. The expected way to get here is to call `fit` that @@ -1069,8 +1050,7 @@ class Model(training_lib.Model): output_loss_metrics=self._output_loss_metrics) outputs = (output_dict['total_loss'] + output_dict['output_losses'] + output_dict['metrics']) - outputs = [ - training_v2_utils._non_none_constant_value(v) for v in outputs] # pylint: disable=protected-access + outputs = [_non_none_constant_value(v) for v in outputs] # pylint: disable=protected-access else: x = training_utils.ModelInputs(x).as_list() ins = x + list(y or []) + list(sample_weights or []) @@ -1129,17 +1109,6 @@ class Model(training_lib.Model): """ self._assert_compile_was_called() self._check_call_args('test_on_batch') - if self._experimental_run_tf_function: - outputs = training_v2_utils.test_on_batch( - self, x, y=y, sample_weight=sample_weight, - reset_metrics=reset_metrics, standalone=True) - outputs = (outputs['total_loss'] + outputs['output_losses'] + - outputs['metrics']) - outputs = [ - training_v2_utils._non_none_constant_value(v) for v in outputs] # pylint: disable=protected-access - if len(outputs) == 1: - outputs = outputs[0] - return outputs if (self._distribution_strategy and distribution_strategy_context.in_cross_replica_context()): @@ -1160,8 +1129,7 @@ class Model(training_lib.Model): output_loss_metrics=self._output_loss_metrics) outputs = (output_dict['total_loss'] + output_dict['output_losses'] + output_dict['metrics']) - outputs = [ - training_v2_utils._non_none_constant_value(v) for v in outputs] # pylint: disable=protected-access + outputs = [_non_none_constant_value(v) for v in outputs] # pylint: disable=protected-access else: x = training_utils.ModelInputs(x).as_list() inputs = x + list(y or []) + list(sample_weights or []) @@ -1196,8 +1164,6 @@ class Model(training_lib.Model): expectations of the model. 
""" self._check_call_args('predict_on_batch') - if self._experimental_run_tf_function: - return training_v2_utils.predict_on_batch(self, x, standalone=True) if (self._distribution_strategy and distribution_strategy_context.in_cross_replica_context()): @@ -2601,6 +2567,7 @@ class Model(training_lib.Model): ValueError: If dict inputs are passed to a Sequential Model where the first layer isn't FeatureLayer. """ + self._set_save_spec(inputs) inputs = self._set_input_attrs(inputs) if outputs is None: @@ -2760,7 +2727,7 @@ class Model(training_lib.Model): training setting, return the epoch the training is supposed to continue at. Otherwise, return the `initial_epoch` the user passes in. """ - if hasattr(self, '_training_state'): + if self._training_state is not None: return self._training_state.maybe_load_initial_epoch_from_ckpt( initial_epoch, mode) return initial_epoch @@ -2781,7 +2748,7 @@ class Model(training_lib.Model): # then the optimizer is set. This is different from whether the # model is compiled # (i.e. whether the model is built and its inputs/outputs are set). - if not self.optimizer: + if not self._compile_was_called: raise RuntimeError('You must compile your model before ' 'training/testing. ' 'Use `model.compile(optimizer, loss)`.') @@ -2821,6 +2788,21 @@ class Model(training_lib.Model): def _trackable_saved_model_saver(self): return model_serialization.ModelSavedModelSaver(self) + def _get_compile_args(self): + self._assert_compile_was_called() + kwargs = { + 'loss': self.loss, + 'metrics': self._compile_metrics, + 'loss_weights': self.loss_weights, + 'sample_weight_mode': self.sample_weight_mode, + 'weighted_metrics': self._compile_weighted_metrics, + } + return kwargs + + @property + def _compile_was_called(self): + return self._v1_compile_was_called + class DistributedCallbackModel(Model): """Model that is used for callbacks with tf.distribute.Strategy.""" @@ -3189,3 +3171,8 @@ def _get_metrics_from_layers(layers): else: metrics.extend(layer.metrics) return metrics + + +def _non_none_constant_value(v): + constant_value = tensor_util.constant_value(v) + return constant_value if constant_value is not None else v diff --git a/tensorflow/python/keras/engine/training_v2.py b/tensorflow/python/keras/engine/training_v2.py deleted file mode 100644 index e994a8cd187..00000000000 --- a/tensorflow/python/keras/engine/training_v2.py +++ /dev/null @@ -1,778 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Training related logic for Keras model in TF 2.0 context. - -Note that all the code under this module is under active development, please DO -NOT use it unless you are really sure what you are doing. 
-""" - -# pylint: disable=protected-access -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import functools - -import numpy as np - -from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.distribute import distribution_strategy_context as ds_context -from tensorflow.python.framework import errors -from tensorflow.python.keras import callbacks as cbks -from tensorflow.python.keras.distribute import distributed_training_utils as dist_utils -from tensorflow.python.keras.engine import data_adapter -from tensorflow.python.keras.engine import training_utils -from tensorflow.python.keras.engine import training_v2_utils -from tensorflow.python.keras.utils.mode_keys import ModeKeys -from tensorflow.python.platform import tf_logging as logging -from tensorflow.python.profiler import traceme -from tensorflow.python.util import nest -from tensorflow.python.util import tf_contextlib - - -# The list of DataAdapter that support validation_split, only numpy and data -# tensor support validation_split for now. -_ADAPTER_FOR_VALIDATION_SPLIT = [data_adapter.TensorLikeDataAdapter, - data_adapter.GenericArrayLikeDataAdapter] - -# The list of DataAdapter that support model._standardize_user_data. Currently -# keras.sequence/python generator will cause error when calling -# model._standardize_user_data, this should be updated in future cl, eg, the -# dataset/generate/sequence input will be peeked and processed by -# model._standardize_user_data() -_ADAPTER_FOR_STANDARDIZE_USER_DATA = [ - data_adapter.TensorLikeDataAdapter, - data_adapter.GenericArrayLikeDataAdapter, - data_adapter.CompositeTensorDataAdapter -] - - -def run_one_epoch(model, - iterator, - execution_function, - dataset_size=None, - batch_size=None, - strategy=None, - steps_per_epoch=None, - num_samples=None, - mode=ModeKeys.TRAIN, - training_context=None, - total_epochs=None): - """Run the execution function with the data from iterator. - - Given the dataset iterator and execution function, get the data from iterator - and call it with the execution function to get the result (metric/loss). - It will run for steps_per_epoch or until to the iterator is fully consumed. - - Args: - model: The keras model to run. - iterator: the dataset iterator to fetch the data. - execution_function: a tf.function that can be called with data. - dataset_size: the size of iterator, None when unknown. - batch_size: The size of the current batch. - strategy: the distribution strategy instance from the model. - steps_per_epoch: the number of steps to run for the epoch. - num_samples: the number of samples for the whole epoch if known. This can be - used to calculate the final partial batch, and scale the loss. - mode: the mode for the current epoch. - training_context: the context that contains callbacks and progress bar. - total_epochs: the total number of epochs that will be run. - Used when throw error when the iterator unexpectedly - reaches its end. - Returns: - The loss and metric value from the model. - """ - # Only use the sample to count if there is a partial batch at the end. 
- use_steps = num_samples is None - - if mode == ModeKeys.PREDICT: - aggregator = training_utils.OutputsAggregator( - use_steps=use_steps, - steps=steps_per_epoch, - num_samples=num_samples, - batch_size=batch_size) - else: - aggregator = training_utils.MetricsAggregator( - use_steps=use_steps, steps=steps_per_epoch, num_samples=num_samples) - callbacks = training_context.callbacks - progbar = training_context.progbar - - if callbacks.model.stop_training: - return - - target_steps = steps_per_epoch or np.inf - step = 0 - - while step < target_steps: - if use_steps: - current_batch_size = 1 - elif step < target_steps - 1: - current_batch_size = batch_size - else: - current_batch_size = num_samples - step * batch_size - with training_context.on_batch( - step=step, mode=mode, size=current_batch_size) as batch_logs: - try: - batch_outs = execution_function(iterator) - except (StopIteration, errors.OutOfRangeError): - # TODO(kaftan): File bug about tf function and errors.OutOfRangeError? - # Are there any other C++ errors tf function should recapture? - # The only acceptable case here is that the input has a unknown - # length, and configured to fully consume it. - if (dataset_size is None - and steps_per_epoch is None - and step > 0): - # The input passed by the user ran out of batches. - # Now we know the cardinality of the input(dataset or generator). - steps_per_epoch = step - aggregator.steps = steps_per_epoch - if mode == ModeKeys.TRAIN: - progbar.params['steps'] = steps_per_epoch - progbar.progbar.target = steps_per_epoch - else: - callbacks.model.stop_training = True - logging.warning( - 'Your input ran out of data; interrupting training. ' - 'Make sure that your dataset or generator can generate at ' - 'least `steps_per_epoch * epochs` batches (in this case, ' - '{} batches). You may need to use the repeat() function ' - 'when building your dataset.'.format( - total_epochs * steps_per_epoch)) - # In either case, break out the loop for training batch. - # Also note the training_context that data inputs are exhausted, so all - # the post batch hooks can be skipped. - batch_logs['data_exhausted'] = True - break - - if mode != ModeKeys.PREDICT: - data_batch_size = batch_outs['batch_size'] - batch_outs = (batch_outs['total_loss'] + batch_outs['output_losses'] - + batch_outs['metrics']) - if current_batch_size != data_batch_size: - batch_logs['size'] = data_batch_size - current_batch_size = data_batch_size - else: - batch_outs = training_v2_utils._aggregate_predict_results( - strategy, batch_outs, model) - - if step == 0: - aggregator.create(batch_outs) - - if use_steps: - aggregator.aggregate(batch_outs) - else: - aggregator.aggregate( - batch_outs, - batch_start=step * batch_size, - batch_end=step * batch_size + current_batch_size) - cbks.make_logs(model, batch_logs, batch_outs, mode) - step += 1 - - if callbacks.model.stop_training: - break - - # End of an epoch. - aggregator.finalize() - return aggregator.results - - -class Loop(training_utils.TrainingLoop): - """The training loop for the TF 2.0. - - This class has some existing assumption for runtime, eg eager by default, - have distribution strategy, etc. 
- """ - - def fit( - self, model, x=None, y=None, batch_size=None, epochs=1, verbose=1, - callbacks=None, validation_split=0., validation_data=None, shuffle=True, - class_weight=None, sample_weight=None, initial_epoch=0, - steps_per_epoch=None, validation_steps=None, validation_freq=1, - max_queue_size=10, workers=1, use_multiprocessing=False, **kwargs): - batch_size = model._validate_or_infer_batch_size( - batch_size, steps_per_epoch, x) - - strategy = model.distribute_strategy - batch_size, steps_per_epoch = dist_utils.process_batch_and_step_size( - strategy, - x, - batch_size, - steps_per_epoch, - ModeKeys.TRAIN, - validation_split=validation_split) - dist_utils.validate_callbacks(input_callbacks=callbacks, - optimizer=model.optimizer) - # Enter tf.distribute.Strategy scope. - with strategy.scope(): - training_data_adapter, validation_adapter = _process_training_inputs( - model, - x, - y, - batch_size=batch_size, - epochs=epochs, - sample_weights=sample_weight, - class_weights=class_weight, - validation_split=validation_split, - steps_per_epoch=steps_per_epoch, - shuffle=shuffle, - validation_data=validation_data, - validation_steps=validation_steps, - distribution_strategy=strategy, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing) - - total_samples = _get_total_number_of_samples(training_data_adapter) - use_sample = total_samples is not None - do_validation = (validation_adapter is not None) - - recreate_training_iterator = ( - training_data_adapter.should_recreate_iterator()) - if not steps_per_epoch: - # TODO(b/139762795): Add step inference for when steps is None to - # prevent end of sequence warning message. - steps_per_epoch = training_data_adapter.get_size() - - # tf.print('{} on {} steps.'.format(ModeKeys.TRAIN, steps_per_epoch)) - training_context = TrainingContext() - - training_dataset = training_data_adapter.get_dataset() - # Raise an error if steps_per_epoch isn't specified but the dataset - # is infinite. - # TODO(scottzhu): This check should probably happen in the adapter - inferred_steps = training_utils.infer_steps_for_dataset( - model, - training_dataset, - steps_per_epoch, - steps_name='steps_per_epoch', - epochs=0) - - steps_per_epoch = ( - inferred_steps if steps_per_epoch is None else steps_per_epoch) - - training_dataset = strategy.experimental_distribute_dataset( - training_dataset) - - training_function = training_v2_utils._get_or_make_execution_function( - model, ModeKeys.TRAIN) - - training_data_iter = None - if do_validation: - validation_dataset = validation_adapter.get_dataset() - if not validation_steps: - # Raise an error if validation_steps isn't specified but the - # validation dataset is infinite. 
- validation_steps = ( - validation_adapter.get_size() or - training_utils.infer_steps_for_dataset( - model, - validation_dataset, - validation_steps, - steps_name='validation_steps')) - eval_function = training_v2_utils._get_or_make_execution_function( - model, ModeKeys.TEST) - eval_data_iter = None - validation_dataset = strategy.experimental_distribute_dataset( - validation_dataset) - val_total_samples = _get_total_number_of_samples(validation_adapter) - else: - val_total_samples = None - - if verbose and (total_samples or steps_per_epoch): - _print_train_info(total_samples, steps_per_epoch, val_total_samples, - validation_steps) - - training_callbacks = cbks.configure_callbacks( - callbacks, - model, - do_validation=do_validation, - batch_size=batch_size, - epochs=epochs, - steps_per_epoch=steps_per_epoch, - samples=total_samples or steps_per_epoch, - count_mode='samples' if use_sample else 'steps', - verbose=0, # Handle ProgBarLogger separately in this loop. - mode=ModeKeys.TRAIN) - - with training_context.on_start(model, training_callbacks, use_sample, - verbose, ModeKeys.TRAIN): - - initial_epoch = model._maybe_load_initial_epoch_from_ckpt( - initial_epoch, ModeKeys.TRAIN) - - for epoch in range(initial_epoch, epochs): - if training_context.callbacks.model.stop_training: - break - - # Training - with training_context.on_epoch(epoch, ModeKeys.TRAIN) as epoch_logs: - model.reset_metrics() - if training_data_iter is None or recreate_training_iterator: - if training_data_iter is not None and ds_context.has_strategy(): - # TODO(kaftan): remove this when MultiDeviceIterator is a - ## compositetensor (unless this is more efficient) - training_data_iter._initializer # pylint: disable=pointless-statement - else: - training_data_iter = iter(training_dataset) - - training_result = run_one_epoch( - model, - training_data_iter, - training_function, - dataset_size=training_data_adapter.get_size(), - batch_size=training_data_adapter.batch_size(), - strategy=strategy, - steps_per_epoch=steps_per_epoch, - num_samples=total_samples, - mode=ModeKeys.TRAIN, - training_context=training_context, - total_epochs=epochs) - cbks.make_logs(model, epoch_logs, training_result, ModeKeys.TRAIN) - - # In the case of steps_per_epoch = None, the final cardinality will - # be determined when the inputs are fully consumed (eg dataset or - # generator). Update the steps_per_epoch to the new value. - if (steps_per_epoch is None - and training_context.progbar.progbar.target is not None): - steps_per_epoch = training_context.progbar.progbar.target - - # Evaluation - if (do_validation and - training_utils.should_run_validation(validation_freq, epoch) and - not training_callbacks.model.stop_training): - if eval_data_iter is not None and ds_context.has_strategy(): - # TODO(kaftan): remove this when MultiDeviceIterator is a - ## compositetensor (unless this is more efficient) - eval_data_iter._initializer # pylint: disable=pointless-statement - else: - eval_data_iter = iter(validation_dataset) - - validation_callbacks = cbks.configure_callbacks( - training_callbacks, - model, - batch_size=batch_size, - epochs=1, - steps_per_epoch=validation_steps, - samples=val_total_samples or validation_steps, - count_mode='samples' if use_sample else 'steps', - verbose=0, # Handle ProgBarLogger separately in this loop. 
- mode=ModeKeys.TEST) - - eval_context = TrainingContext() - with eval_context.on_start( - model, - validation_callbacks, - use_sample, - verbose=0, - mode=ModeKeys.TEST): - with eval_context.on_epoch(epoch, ModeKeys.TEST): - model.reset_metrics() - eval_result = run_one_epoch( - model, - eval_data_iter, - eval_function, - dataset_size=validation_adapter.get_size(), - batch_size=validation_adapter.batch_size(), - strategy=strategy, - steps_per_epoch=validation_steps, - num_samples=val_total_samples, - mode=ModeKeys.TEST, - training_context=eval_context, - total_epochs=1) - cbks.make_logs(model, epoch_logs, eval_result, ModeKeys.TEST, - prefix='val_') - - return model.history - - def _model_iteration( - self, model, mode, x=None, y=None, batch_size=None, verbose=1, - sample_weight=None, steps=None, callbacks=None, max_queue_size=10, - workers=1, use_multiprocessing=False, **kwargs): - - batch_size = model._validate_or_infer_batch_size( - batch_size, steps, x) - strategy = model.distribute_strategy - batch_size, steps = dist_utils.process_batch_and_step_size( - strategy, x, batch_size, steps, mode) - dist_utils.validate_callbacks(input_callbacks=callbacks, - optimizer=model.optimizer) - # Enter tf.distribute.Strategy scope. - with strategy.scope(): - adapter = _process_inputs( - model, - mode, - x, - y, - batch_size=batch_size, - sample_weights=sample_weight, - steps=steps, - distribution_strategy=strategy, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing) - total_samples = _get_total_number_of_samples(adapter) - use_sample = total_samples is not None - dataset = adapter.get_dataset() - - if not steps: - # Raise an error if `steps` isn't specified but the dataset - # is infinite. - steps = adapter.get_size() or training_utils.infer_steps_for_dataset( - model, dataset, steps, steps_name='steps') - - # tf.print('{} on {} steps.'.format(ModeKeys.TRAIN, steps_per_epoch)) - training_context = TrainingContext() - if training_v2_utils._should_add_batch_index_to_element(strategy, mode): - dataset = training_v2_utils._add_batch_index_to_element(dataset) - dataset = strategy.experimental_distribute_dataset(dataset) - - execution_function = training_v2_utils._get_or_make_execution_function( - model, mode) - - data_iterator = iter(dataset) - - callbacks = cbks.configure_callbacks( - callbacks, - model, - do_validation=False, - batch_size=batch_size, - epochs=1, - steps_per_epoch=steps, - samples=total_samples, - count_mode='samples' if use_sample else 'steps', - verbose=0, # Handle ProgBarLogger separately in this loop. 
- mode=mode) - - with training_context.on_start( - model, callbacks, use_sample, verbose, mode): - with training_context.on_epoch(0, mode) as epoch_logs: - model.reset_metrics() - result = run_one_epoch( - model, - data_iterator, - execution_function, - dataset_size=adapter.get_size(), - batch_size=adapter.batch_size(), - strategy=strategy, - steps_per_epoch=steps, - num_samples=total_samples, - mode=mode, - training_context=training_context, - total_epochs=1) - cbks.make_logs(model, epoch_logs, result, mode) - - if len(result) == 1: - result = result[0] - return result - - def evaluate( - self, model, x=None, y=None, batch_size=None, verbose=1, - sample_weight=None, steps=None, callbacks=None, max_queue_size=10, - workers=1, use_multiprocessing=False, **kwargs): - return self._model_iteration( - model, ModeKeys.TEST, x=x, y=y, batch_size=batch_size, verbose=verbose, - sample_weight=sample_weight, steps=steps, callbacks=callbacks, - max_queue_size=max_queue_size, workers=workers, - use_multiprocessing=use_multiprocessing, **kwargs) - - def predict(self, model, x, batch_size=None, verbose=0, steps=None, - callbacks=None, max_queue_size=10, workers=1, - use_multiprocessing=False, **kwargs): - return self._model_iteration( - model, ModeKeys.PREDICT, x=x, batch_size=batch_size, verbose=verbose, - steps=steps, callbacks=callbacks, max_queue_size=max_queue_size, - workers=workers, use_multiprocessing=use_multiprocessing, **kwargs) - - -def _process_training_inputs(model, - x, - y, - batch_size=None, - epochs=1, - sample_weights=None, - class_weights=None, - steps_per_epoch=None, - validation_split=0., - validation_data=None, - validation_steps=None, - shuffle=True, - distribution_strategy=None, - max_queue_size=10, - workers=1, - use_multiprocessing=False): - """Process the data input for fit() with respect to validation_split.""" - if validation_split and 0. < validation_split < 1. and validation_data: - raise ValueError('validation_data and validation_split cannot be used ' - 'at same time.') - - adapter_cls = data_adapter.select_data_adapter(x, y) - - # Handle validation_split, we want to split the data and get the training - # section before we give it to data adapter. - if validation_split and 0. < validation_split < 1.: - if adapter_cls not in _ADAPTER_FOR_VALIDATION_SPLIT: - raise ValueError( - '`validation_split` argument is not supported when ' - 'data adapter is {}. Received: x={}, validation_split={}'.format( - adapter_cls, x, validation_split)) - # Retrieve the training section from x and y, and then construct dataset - # from it. 
- x, y, sample_weights = model._standardize_user_data( - x, - y, - sample_weight=sample_weights, - class_weight=class_weights, - batch_size=batch_size, - check_steps=False, - steps=steps_per_epoch) - (x, y, sample_weights, - val_x, val_y, - val_sample_weights) = training_utils.split_training_and_validation_data( - x, y, sample_weights, validation_split) - - sample_weight_modes = [ - e.sample_weight_mode for e in model._training_endpoints - ] - train_adapter = adapter_cls( - x, - y, - batch_size=batch_size, - steps=steps_per_epoch, - epochs=epochs, - sample_weights=sample_weights, - sample_weight_modes=sample_weight_modes, - shuffle=shuffle, - distribution_strategy=distribution_strategy) - - val_adapter = adapter_cls( - val_x, - val_y, - steps=validation_steps, - sample_weights=val_sample_weights, - sample_weight_modes=sample_weight_modes, - batch_size=batch_size, - distribution_strategy=distribution_strategy) - else: - train_adapter = _process_inputs( - model, - ModeKeys.TRAIN, - x, - y, - sample_weights=sample_weights, - batch_size=batch_size, - steps=steps_per_epoch, - epochs=epochs, - class_weights=class_weights, - shuffle=shuffle, - distribution_strategy=distribution_strategy, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing) - val_adapter = None - if validation_data: - (val_x, val_y, - val_sample_weights) = training_utils.unpack_validation_data( - validation_data, raise_if_ambiguous=False) - # For eval data, we use a representative batch size of the - # training data if batch_size was unknown. - # This is useful for generator/sequence training data input with numpy - # validation data input. - if not batch_size: - batch_size = train_adapter.representative_batch_size() - val_adapter = _process_inputs( - model, - ModeKeys.TEST, - val_x, - val_y, - steps=validation_steps, - sample_weights=val_sample_weights, - batch_size=batch_size, - class_weights=class_weights, - distribution_strategy=distribution_strategy) - elif validation_steps: - raise ValueError('`validation_steps` should not be specified if ' - '`validation_data` is None.') - return train_adapter, val_adapter - - -def _process_inputs(model, - mode, - x, - y, - batch_size=None, - epochs=1, - sample_weights=None, - class_weights=None, - shuffle=False, - steps=None, - distribution_strategy=None, - max_queue_size=10, - workers=1, - use_multiprocessing=False): - """Process the inputs for fit/eval/predict().""" - adapter_cls = data_adapter.select_data_adapter(x, y) - standardize = functools.partial( - model._standardize_user_data, - class_weight=class_weights, - batch_size=batch_size, - check_steps=False, - steps=steps) - if adapter_cls in _ADAPTER_FOR_STANDARDIZE_USER_DATA: - standardize_function = None - x, y, sample_weights = standardize( - x, y, sample_weight=sample_weights) - elif adapter_cls is data_adapter.ListsOfScalarsDataAdapter: - standardize_function = standardize - else: - def standardize_function(dataset): - """Data adapters can standardize when appropriate.""" - # First we call _standardize_user_data with the dataset since that has - # enough structure to build the model. 
- if not model._is_compiled: - # We don't actually care about the values of these attributes, but they - # are only created in compile and are accessed in _standardize_user_data - model._training_endpoints = getattr(model, '_training_endpoints', []) - model.sample_weight_mode = getattr(model, 'sample_weight_mode', None) - - standardize(dataset, extract_tensors_from_dataset=False) - - # Then we map using only the tensor standardization portion. - def map_fn(x, y=None, sample_weights=None): - """Tensor manipulation portion of standardization for Dataset.map.""" - if (y is None and sample_weights is None): - # namedtuples are forbidden because it is ambiguous if they should be - # unpacked. If y or sample_weights is present then `x` was not the - # top level structure, and the correct behavior is unambiguous. - data_adapter.assert_not_namedtuple(x) - - standardized = model._standardize_tensors( - x, y, sample_weights, - run_eagerly=False, - dict_inputs=isinstance(x, dict), - is_dataset=False, - class_weight=class_weights, - batch_size=None) - x, y, sample_weights = nest._list_to_tuple(standardized) - if y is None: - return (x,) - if sample_weights is None: - return x, y - return x, y, sample_weights - return dataset.map(map_fn, num_parallel_calls=dataset_ops.AUTOTUNE) - - if mode == ModeKeys.PREDICT: - sample_weight_modes = None - else: - sample_weight_modes = [ - e.sample_weight_mode for e in model._training_endpoints - ] or model.sample_weight_mode - - adapter = adapter_cls( - x, - y, - standardize_function=standardize_function, - batch_size=batch_size, - epochs=epochs, - steps=steps, - sample_weights=sample_weights, - sample_weight_modes=sample_weight_modes, - shuffle=shuffle, - distribution_strategy=distribution_strategy, - max_queue_size=max_queue_size, - workers=workers, - use_multiprocessing=use_multiprocessing) - - return adapter - - -def _get_total_number_of_samples(adapter): - if not adapter.get_size() or not adapter.batch_size(): - return None - total_sample = adapter.get_size() * adapter.batch_size() - if adapter.has_partial_batch(): - total_sample -= (adapter.batch_size() - adapter.partial_batch_size()) - return total_sample - - -def _print_train_info(total_samples, steps, val_total_samples, val_steps): - increment = 'samples' if total_samples else 'steps' - conjunction = 'on' if total_samples else 'for' - msg = 'Train {} {} {}'.format(conjunction, total_samples or steps, increment) - if val_total_samples or val_steps: - increment = 'samples' if val_total_samples else 'steps' - conjunction = 'on' if val_total_samples else 'for' - msg += ', validate {} {} {}'.format(conjunction, val_total_samples or - val_steps, increment) - print(msg) - - -class TrainingContext(object): - """Utility object that wrap around callbacks and progress bars.""" - - @tf_contextlib.contextmanager - def on_start(self, model, callbacks=None, use_samples=False, verbose=0, - mode=ModeKeys.TRAIN): - """Provide a scope for the whole training process.""" - # TODO(omalleyt): Handle ProgBar as part of Callbacks once hooks are ready. - progbar = training_utils.get_progbar( - model, 'samples' if use_samples else 'steps') - progbar.params = callbacks.params - progbar.params['verbose'] = verbose - callbacks.model.stop_training = False - callbacks._call_begin_hook(mode) - progbar.on_train_begin() - - # Cache those two instance so that it can be used in other functions. 
- self.callbacks = callbacks - self.progbar = progbar - - try: - yield - model._successful_loop_finish = True - finally: - # End of all epochs - self.callbacks._call_end_hook(mode) - - @tf_contextlib.contextmanager - def on_epoch(self, epoch=0, mode=ModeKeys.TRAIN): - """Provide a scope for running one epoch.""" - epoch_logs = {} - if mode == ModeKeys.TRAIN: - self.callbacks.on_epoch_begin(epoch, epoch_logs) - self.progbar.on_epoch_begin(epoch, epoch_logs) - try: - yield epoch_logs - finally: - if mode == ModeKeys.TRAIN: - # Epochs only apply to `fit`. - self.callbacks.on_epoch_end(epoch, epoch_logs) - self.progbar.on_epoch_end(epoch, epoch_logs) - - @tf_contextlib.contextmanager - def on_batch(self, step=0, mode=ModeKeys.TRAIN, size=1): - """Provide a scope for running one batch.""" - with traceme.TraceMe( - 'TraceContext', graph_type=mode, step_num=step, batch_size=size): - batch_logs = {'batch': step, 'size': size} - self.callbacks._call_batch_hook( - mode, 'begin', step, batch_logs) - self.progbar.on_batch_begin(step, batch_logs) - try: - yield batch_logs - finally: - if not batch_logs.pop('data_exhausted', False): - self.callbacks._call_batch_hook( - mode, 'end', step, batch_logs) - self.progbar.on_batch_end(step, batch_logs) diff --git a/tensorflow/python/keras/engine/training_v2_utils.py b/tensorflow/python/keras/engine/training_v2_utils.py deleted file mode 100644 index b7eb1b123b6..00000000000 --- a/tensorflow/python/keras/engine/training_v2_utils.py +++ /dev/null @@ -1,556 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Training related logic for Keras model in TF 2.0 context. - -Note that all the code under this module is under active development, please DO -NOT use it unless you are really sure what you are doing. 
-""" - -# pylint: disable=protected-access -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import collections -import functools - -import numpy as np - -from tensorflow.python.distribute import distribution_strategy_context -from tensorflow.python.eager import def_function -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.framework import sparse_tensor -from tensorflow.python.framework import tensor_util -from tensorflow.python.framework.ops import composite_tensor -from tensorflow.python.keras import backend -from tensorflow.python.keras.distribute import distributed_training_utils as dist_utils -from tensorflow.python.keras.engine import training_eager -from tensorflow.python.keras.engine import training_utils -from tensorflow.python.keras.utils.mode_keys import ModeKeys -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops.ragged import ragged_tensor -from tensorflow.python.util import nest - - -def _get_or_make_function(model, mode, key_fn, make_fn): - """Helper function for managing cached execution functions.""" - model._init_distributed_function_cache_if_not_compiled() - key = key_fn(mode) - - function = dist_utils.get_distributed_function(model, key) - if function: - return function - - function = make_fn(model, mode) - dist_utils.set_distributed_function(model, key, function) - return function - - -def _get_or_make_execution_function(model, mode): - """Makes or reuses function to run one step of distributed model execution.""" - return _get_or_make_function( - model, mode, - # Use a key with 'v2' to distinguish from fall-back execution functions. - key_fn=lambda m: (m, 'v2'), - make_fn=_make_execution_function) - - -def _make_execution_function(model, mode): - """Creates a function to run one step of distributed model execution.""" - per_replica_function = _make_replica_execution_function(model, mode) - - def distributed_function(input_iterator): - """A single step of the distributed execution across replicas.""" - # Call `Model.{train,test,predict}_on_batch` on every replica passing - # PerReplicas as arguments. On every replica inside this call, each - # PerReplica object will return the value for that replica. The outputs - # are PerReplicas too. - strategy = distribution_strategy_context.get_strategy() - args = _prepare_feed_values(model, input_iterator, mode, strategy) - outputs = strategy.experimental_run_v2( - per_replica_function, args=args) - # Out of PerReplica outputs reduce or pick values to return. - all_outputs = dist_utils.unwrap_output_dict( - strategy, outputs, mode) - return all_outputs - - if not model.run_eagerly: - distributed_function = def_function.function( - distributed_function, autograph=False) - - def execution_function(input_fn): - # `numpy` translates Tensors to values in Eager mode. - return nest.map_structure(_non_none_constant_value, - distributed_function(input_fn)) - - return execution_function - - -def _get_or_make_on_batch_function(model, mode): - """Makes or reuses function to run one step of distributed model execution.""" - return _get_or_make_function( - model, mode, - # Use a key with 'v2' to distinguish from fall-back execution functions. 
- key_fn=lambda m: (m, 'v2_on_batch'), - make_fn=_make_on_batch_function) - - -def _make_on_batch_function(model, mode): - """Creates a function of Model.*_on_batch methods.""" - if mode == ModeKeys.TRAIN: - func = training_eager.train_on_batch - elif mode == ModeKeys.TEST: - func = training_eager.test_on_batch - else: - func = model - - if not model.run_eagerly: - # Pass `experimental_relax_shapes` to avoid retracing for dynamic batch - # size, variable length sequences, etc. - func = def_function.function(func, experimental_relax_shapes=True) - - return func - - -def _non_none_constant_value(v): - constant_value = tensor_util.constant_value(v) - return constant_value if constant_value is not None else v - - -def _prepare_feed_values(model, inputs, mode, strategy): - """Prepare feed values to the model execution function. - - Arguments: - model: Model to prepare feed values for. - inputs: An iterator of model inputs, targets, and sample_weights. - model inputs may be lists, single values, or dicts mapping input feed - names to values. - mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT. - strategy: The current distribution strategy for the model. - - Returns: - Feed values for the model in the given mode. This is a tuple of - the structure (inputs, targets, sample_weights), where each of - (tuple, targets, sample_weights) may be a python list. Single values - for inputs will always be wrapped in lists. - """ - # For predict, we need to extract the manually added batch_index first. - with_batch_index = _should_add_batch_index_to_element(strategy, mode) - - inputs, targets, sample_weights, batch_index = _get_input_from_iterator( - inputs, with_batch_index) - - # When the inputs are dict, then we want to flatten it in the same order as - # the input layers, such that the data are fed into the input layers in the - # correct order. - if isinstance(inputs, dict): - inputs = [inputs[key] for key in model._feed_input_names] - else: - inputs = training_utils.ModelInputs(inputs).as_list() - - if mode == ModeKeys.PREDICT: - sample_weights = [] - targets = [] - - ins = [inputs, targets, sample_weights] - if batch_index is not None: - ins.append(batch_index) - return tuple(ins) - - -def _get_input_from_iterator(iterator, with_batch_index=False): - """Get elements from the iterator and verify the input shape and type.""" - next_element = next(iterator) - if with_batch_index: - batch_index, next_element = next_element - else: - batch_index = None - - if (tensor_util.is_tensor(next_element) or - isinstance(next_element, (dict, composite_tensor.CompositeTensor))): - next_element = [next_element] - if len(next_element) == 1: - x, = next_element - y = None - sample_weights = None - elif len(next_element) == 2: - x, y = next_element - sample_weights = None - else: - x, y, sample_weights = next_element - - # Validate that all the elements in x and y are of the same type and shape. - dist_utils.validate_distributed_dataset_inputs( - distribution_strategy_context.get_strategy(), x, y, sample_weights) - return x, y, sample_weights, batch_index - - -def _make_replica_execution_function(model, mode): - """A single step of the distributed execution on a replica.""" - if mode == ModeKeys.TRAIN: - func = functools.partial(train_on_batch, model) - elif mode == ModeKeys.TEST: - func = functools.partial(test_on_batch, model) - else: - def _predict_on_batch(x, y=None, sample_weights=None, batch_index=None): - del y, sample_weights - # Note that the x and batch_index is already per-replica value. 
- result = predict_on_batch(model, x) - if batch_index is None: - return result - else: - return batch_index, result - - func = _predict_on_batch - - if mode != ModeKeys.PREDICT: - # `reset_metrics` is set to False to maintain stateful metrics across - # batch-level calls. - func = functools.partial(func, reset_metrics=False) - - return func - - -def _aggregate_predict_results(strategy, batch_outs, model): - """Aggregate the prediction result from each replica.""" - num_replicas = strategy.num_replicas_in_sync - num_outputs = len(model.outputs) - - if not isinstance(batch_outs, list): - batch_outs = [batch_outs] - - with_batch_index = _should_add_batch_index_to_element( - strategy, ModeKeys.PREDICT) - - # batch_outs is in following structure: - # [ - # replica_1_batch_index, replica_2_batch_index, ...., replica_x_batch_index, - # replica_1_output_1, replica_2_output_1, ...., replica_x_output_1, - # ...... - # replica_1_output_y, replica_2_output_y, ...., replica_x_output_y, - # ] - # The replica_x_batch_index is optional and depended on teh strategy type. - if with_batch_index: - batch_index, batch_outs = (batch_outs[:num_replicas], - batch_outs[num_replicas:]) - batch_index = dist_utils.concat_along_batch_dimension(batch_index) - # Reorder the batch_index for it to do proper gather. Eg, if the original - # index is [0, 2, 4, 6, 1, 3, 5, 7], then the index for gather should be - # [0, 4, 1, 5, 2, 6, 3, 7]. - batch_index = np.argsort(batch_index) - # Only need to gather if the batch index is not sorted. - need_batch_index_gather = np.any(np.diff(batch_index) < 0) - else: - need_batch_index_gather = False - - total_batch_outs = [] - for i in range(num_outputs): - nested_outs = batch_outs[i * num_replicas:i * num_replicas + num_replicas] - per_output_result = dist_utils.concat_along_batch_dimension( - nest.flatten(nested_outs)) - - if need_batch_index_gather: - if _get_batch_size(per_output_result).numpy() == len(batch_index): - # Skip the gather if the output has a different batch size than the - # batch_index. There will be some error handling in upper layer. - per_output_result = _gather_result_by_index(per_output_result, - batch_index) - total_batch_outs.append(per_output_result) - return total_batch_outs - - -def _gather_result_by_index(input_tensor, batch_index): - """Handle the data element gather for different type of tensor.""" - if isinstance(input_tensor, sparse_tensor.SparseTensor): - # For sparse tensor, both the index and value component should be gathered. - return sparse_tensor.SparseTensor( - indices=array_ops.gather_v2(input_tensor.indices, batch_index), - values=array_ops.gather_v2(input_tensor.values, batch_index), - dense_shape=input_tensor.dense_shape - ) - # For both ragged tensor or eager tensor or np array, tf.gather should do the - # correct thing. - elif isinstance(input_tensor, ragged_tensor.RaggedTensor): - return array_ops.gather_v2(input_tensor, batch_index) - elif isinstance(input_tensor, (ops.EagerTensor, np.ndarray)): - return array_ops.gather_v2(input_tensor, batch_index).numpy() - else: - raise ValueError('Unexpected type {} encountered when gathering ' - 'batch slices.'.format(input_tensor)) - - -def _get_batch_size(inputs): - first_inputs = nest.flatten(inputs)[0] - if isinstance(first_inputs, ragged_tensor.RaggedTensor): - return first_inputs.bounding_shape()[0] - else: - return array_ops.shape(first_inputs)[0] - - -def _add_batch_index_to_element(dataset): - """Adding a new batch index field to the every element in the batch. 
- - This is need in the model.predict() when running with multi-worker - distribution strategy. When sharding/distributing a dataset, the continuity of - the sharded dataset can't be easily ensured without performance sacrifice. It - is fine to train and eval with the reordered data, but not for prediction. To - solve this issue, Keras will add a batch index to each of the element in the - dataset, which will then pass to pre-replica execution function. The real - execution function will remove it before feeding the input to the model, and - pre-replica function will then zip the index with the result. Finally Keras - will sort the batch result based on the added batch-index field, remove it and - return the sorted result. - - Note that we didn't add single index to the per-replica batch, but to each of - the element in the batch, since we can't ensure the data in pre-replica is - continuous. Eg: model with 2 replica and predict with 4 elements per batch - like [1, 2, 3, 4], it is possible to shard as [1, 2], [3, 4], - or [1, 3], [2, 4]. - - Args: - dataset: a dataset that is created by any of the data_adapter, with the - element structure as (x, y, sample_weights). - - Returns: - a new dataset, with the element shape as - (batch_index, (x, y, sample_weights)). - """ - return dataset.map(lambda *inp: (math_ops.range(_get_batch_size(inp)), inp)) - - -def _should_add_batch_index_to_element(strategy, mode): - """Whether or not the batch index should be added to the input dataset. - - See docstring of _add_batch_index_to_element() for more details. So far the - batch index is only need when using TPUStrategy with a multi-worker setting. - We will try to avoid adding batch index for other cases since it has the - performance implication. - - Args: - strategy: the current distribution strategy for the model. - mode: the current mode (Training/Eval/Predict) for the model. - Returns: - Boolean, whether the batch index should be added for the input data to - preserve the ordering. - """ - # TODO(priyag, rxsang): Come up a better way to determine when the batch index - # should be added. - return (mode == ModeKeys.PREDICT - and dist_utils.is_tpu_strategy(strategy) - and strategy.extended.num_hosts > 1) - - -def train_on_batch( - model, - x, - y=None, - sample_weight=None, - class_weight=None, - reset_metrics=True, - standalone=False): - """Runs a single gradient update on a single batch of data. - - Arguments: - model: The model to train. - x: Input data. It could be: - - A Numpy array (or array-like), or a list of arrays - (in case the model has multiple inputs). - - A TensorFlow tensor, or a list of tensors - (in case the model has multiple inputs). - - A dict mapping input names to the corresponding array/tensors, - if the model has named inputs. - - A `tf.data` dataset. - y: Target data. Like the input data `x`, it could be either Numpy - array(s) or TensorFlow tensor(s). It should be consistent with `x` - (you cannot have Numpy inputs and tensor targets, or inversely). If - `x` is a dataset `y` should not be specified - (since targets will be obtained from the iterator). - sample_weight: Optional array of the same length as x, containing - weights to apply to the model's loss for each sample. In the case of - temporal data, you can pass a 2D array with shape (samples, - sequence_length), to apply a different weight to every timestep of - every sample. In this case you should make sure to specify - sample_weight_mode="temporal" in compile(). 
This argument is not - supported when `x` is a dataset. - class_weight: Optional dictionary mapping class indices (integers) to a - weight (float) to apply to the model's loss for the samples from this - class during training. This can be useful to tell the model to "pay - more attention" to samples from an under-represented class. - reset_metrics: If `True`, the metrics returned will be only for this - batch. If `False`, the metrics will be statefully accumulated across - batches. - standalone: If True, this method is not called as part of - Model.fit/evaluate/predict and can therefore be tf.function'd. - - Returns: - Scalar training loss - (if the model has a single output and no metrics) - or list of scalars (if the model has multiple outputs - and/or metrics). The attribute `model.metrics_names` will give you - the display labels for the scalar outputs. - - Raises: - ValueError: In case of invalid user-provided arguments. - """ - model._assert_compile_was_called() - - # TODO(scottzhu): Standardization should happen in the data handlers, - ## not on a per batch basis in the *_on_batch methods - # Validate and standardize user data. - x, y, sample_weights = model._standardize_user_data( - x, y, sample_weight=sample_weight, class_weight=class_weight, - extract_tensors_from_dataset=True) - batch_size = array_ops.shape(nest.flatten(x, expand_composites=True)[0])[0] - # If `model._distribution_strategy` is True, then we are in a replica context - # at this point because of the check above. `train_on_batch` is being run - # for each replica by `model._distribution_strategy` and the same code path - # as Eager is expected to be taken. - - if standalone: - train_on_batch_fn = _get_or_make_on_batch_function(model, ModeKeys.TRAIN) - else: - train_on_batch_fn = training_eager.train_on_batch - - outputs = train_on_batch_fn( - model, - x, - y, - sample_weights=sample_weights, - output_loss_metrics=model._output_loss_metrics) - - if reset_metrics: - model.reset_metrics() - - outputs['batch_size'] = math_ops.cast(batch_size, dtypes.int64) - return outputs - - -def test_on_batch(model, x, y=None, sample_weight=None, reset_metrics=True, - standalone=False): - """Test the model on a single batch of samples. - - Arguments: - model: The model to test. - x: Input data. It could be: - - A Numpy array (or array-like), or a list of arrays - (in case the model has multiple inputs). - - A TensorFlow tensor, or a list of tensors - (in case the model has multiple inputs). - - A dict mapping input names to the corresponding array/tensors, - if the model has named inputs. - - A `tf.data` dataset. - y: Target data. Like the input data `x`, - it could be either Numpy array(s) or TensorFlow tensor(s). - It should be consistent with `x` (you cannot have Numpy inputs and - tensor targets, or inversely). If `x` is a dataset, - `y` should not be specified - (since targets will be obtained from the iterator). - sample_weight: Optional array of the same length as x, containing - weights to apply to the model's loss for each sample. - In the case of temporal data, you can pass a 2D array - with shape (samples, sequence_length), - to apply a different weight to every timestep of every sample. - In this case you should make sure to specify - sample_weight_mode="temporal" in compile(). This argument is not - supported when `x` is a dataset. - reset_metrics: If `True`, the metrics returned will be only for this - batch. If `False`, the metrics will be statefully accumulated across - batches. 
- standalone: If True, this method is not called as part of - Model.fit/evaluate/predict and can therefore be tf.function'd. - - Returns: - Scalar test loss (if the model has a single output and no metrics) - or list of scalars (if the model has multiple outputs - and/or metrics). The attribute `model.metrics_names` will give you - the display labels for the scalar outputs. - - Raises: - ValueError: In case of invalid user-provided arguments. - """ - model._assert_compile_was_called() - - # TODO(scottzhu): Standardization should happen in the data handlers, - ## not on a per batch basis in the *_on_batch methods - # Validate and standardize user data. - x, y, sample_weights = model._standardize_user_data( - x, y, sample_weight=sample_weight, extract_tensors_from_dataset=True) - - batch_size = array_ops.shape(nest.flatten(x, expand_composites=True)[0])[0] - - if standalone: - test_on_batch_fn = _get_or_make_on_batch_function(model, ModeKeys.TEST) - else: - test_on_batch_fn = training_eager.test_on_batch - - outputs = test_on_batch_fn( - model, - x, - y, - sample_weights=sample_weights, - output_loss_metrics=model._output_loss_metrics) - - if reset_metrics: - model.reset_metrics() - - outputs['batch_size'] = math_ops.cast(batch_size, dtypes.int64) - return outputs - - -def predict_on_batch(model, x, standalone=False): - """Returns predictions for a single batch of samples. - - Arguments: - model: The model to predict with. - x: Input data. It could be: - - A Numpy array (or array-like), or a list of arrays - (in case the model has multiple inputs). - - A TensorFlow tensor, or a list of tensors - (in case the model has multiple inputs). - - A `tf.data` dataset. - standalone: If True, this method is not called as part of - Model.fit/evaluate/predict and can therefore be tf.function'd. - - Returns: - Numpy array(s) of predictions. - - Raises: - ValueError: In case of mismatch between given number of inputs and - expectations of the model. - """ - # TODO(scottzhu): Standardization should happen in the data handlers, - ## not on a per batch basis in the *_on_batch methods - # Validate and standardize user data. - inputs, _, _ = model._standardize_user_data( - x, extract_tensors_from_dataset=True) - - # If `model._distribution_strategy` is True, then we are in a replica context - # at this point. - inputs = training_utils.cast_to_model_input_dtypes(inputs, model) - if isinstance(inputs, collections.Sequence): - # Unwrap lists with only one input, as we do when training on batch - if len(inputs) == 1: - inputs = inputs[0] - - if standalone: - predict_on_batch_fn = _get_or_make_on_batch_function( - model, ModeKeys.PREDICT) - else: - predict_on_batch_fn = model - - with backend.eager_learning_phase_scope(0): - return predict_on_batch_fn(inputs) # pylint: disable=not-callable diff --git a/tensorflow/python/keras/engine/training_v2_utils_test.py b/tensorflow/python/keras/engine/training_v2_utils_test.py deleted file mode 100644 index 4499ad3c8c6..00000000000 --- a/tensorflow/python/keras/engine/training_v2_utils_test.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for tensorflow.python.keras.engine.training_v2_utils.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import collections - -from absl.testing import parameterized -import mock -import numpy as np - - -from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.distribute import mirrored_strategy -from tensorflow.python.distribute import strategy_combinations -from tensorflow.python.eager import def_function -from tensorflow.python.framework import combinations -from tensorflow.python.framework import sparse_tensor -from tensorflow.python.keras.distribute import distributed_training_utils as dist_utils -from tensorflow.python.keras.engine import training_v2_utils -from tensorflow.python.keras.utils.mode_keys import ModeKeys -from tensorflow.python.ops import array_ops -from tensorflow.python.ops.ragged import ragged_factory_ops -from tensorflow.python.platform import test - - -class AggregatePredictResultsTest(test.TestCase, parameterized.TestCase): - - def setUp(self): - super(AggregatePredictResultsTest, self).setUp() - strategy_combinations.set_virtual_cpus_to_at_least(3) - self.num_replica = 3 - self.batch_size = 16 - self.dense_shape = (2, 3) - self.total_sample = 2 * self.batch_size - - mock_model = collections.namedtuple('Model', ['outputs']) - self.mock_model = mock_model([1]) - - strategy = mirrored_strategy.MirroredStrategy( - ['/cpu:0', '/cpu:1', '/cpu:2']) - - execution_function = lambda *inp: inp - @def_function.function - def predict_loop(batch): - batch_result = strategy.experimental_run_v2(execution_function, batch) - batch_result = dist_utils.unwrap_output_dict( - strategy, batch_result, ModeKeys.PREDICT) - # swap the order of replica 1 and 2, to mimic random order. - batch_result[2], batch_result[1] = batch_result[1], batch_result[2] - batch_result[5], batch_result[4] = batch_result[4], batch_result[5] - return batch_result - - self.strategy = strategy - self.predict_loop = predict_loop - - @combinations.generate(combinations.combine(tf_api_version=[1, 2], - mode='eager')) - def test_aggregate_predict_results_dense(self): - dataset = dataset_ops.Dataset.range(self.total_sample) - def dense_map_fn(i): - # Mimic what we do for adding batch index - return i, array_ops.fill(self.dense_shape, i) - dense_dataset = dataset.map(dense_map_fn).batch(self.batch_size) - distributed_data = self.strategy.experimental_distribute_dataset( - dense_dataset) - - start = 0 - for batch in distributed_data: - with mock.patch.object(training_v2_utils, - '_should_add_batch_index_to_element', - fake_should_add_batch_index_to_element): - batch_result = self.predict_loop(batch) - final_result = training_v2_utils._aggregate_predict_results( - self.strategy, batch_result, self.mock_model) - - # Make sure the dense result is in a sorted order. 
- expected_result = np.arange( - start=start, stop=start+self.batch_size).reshape((-1, 1)) - expected_result = np.tile(expected_result, 6).reshape( - (-1,) + self.dense_shape) - self.assertAllClose(final_result[0], expected_result) - start += self.batch_size - - @combinations.generate(combinations.combine(tf_api_version=[1, 2], - mode='eager')) - def test_aggregate_predict_results_sparse(self): - dataset = dataset_ops.Dataset.range(self.total_sample) - def sparse_map_fn(i): - return i, sparse_tensor.SparseTensor( - indices=[(0, 0)], - values=[i], - dense_shape=self.dense_shape) - sparse_dataset = dataset.map(sparse_map_fn).batch(self.batch_size) - distributed_data = self.strategy.experimental_distribute_dataset( - sparse_dataset) - - start = 0 - for batch in distributed_data: - with mock.patch.object(training_v2_utils, - '_should_add_batch_index_to_element', - fake_should_add_batch_index_to_element): - batch_result = self.predict_loop(batch) - final_result = training_v2_utils._aggregate_predict_results( - self.strategy, batch_result, self.mock_model) - - # Make sure the dense result is in a sorted order. - expected_values = np.arange(start=start, stop=start+self.batch_size) - self.assertAllClose(final_result[0].values, expected_values) - start += self.batch_size - - @combinations.generate(combinations.combine(tf_api_version=[1, 2], - mode='eager')) - def test_aggregate_predict_results_ragged(self): - dataset = dataset_ops.Dataset.range(self.total_sample) - def ragged_map_fn(i): - return i, ragged_factory_ops.constant([[0], [], []], dtype=np.int64) + i - ragged_dataset = dataset.map(ragged_map_fn).batch(self.batch_size) - distributed_data = self.strategy.experimental_distribute_dataset( - ragged_dataset) - - start = 0 - for batch in distributed_data: - with mock.patch.object(training_v2_utils, - '_should_add_batch_index_to_element', - fake_should_add_batch_index_to_element): - batch_result = self.predict_loop(batch) - final_result = training_v2_utils._aggregate_predict_results( - self.strategy, batch_result, self.mock_model) - - # Make sure the dense result is in a sorted order. - expected_values = np.arange(start=start, stop=start+self.batch_size) - self.assertAllClose(final_result[0].flat_values, expected_values) - start += self.batch_size - - -def fake_should_add_batch_index_to_element(strategy, mode): - # Ignore the strategy instance check since we were using the MirroredStrategy - # for testing. - del strategy - return mode == ModeKeys.PREDICT - - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py index 13134927409..65aadd7cd08 100644 --- a/tensorflow/python/keras/layers/core.py +++ b/tensorflow/python/keras/layers/core.py @@ -1122,12 +1122,17 @@ class Dense(Layer): raise TypeError('Unable to build `Dense` layer with non-floating point ' 'dtype %s' % (dtype,)) input_shape = tensor_shape.TensorShape(input_shape) - if tensor_shape.dimension_value(input_shape[-1]) is None: - raise ValueError('The last dimension of the inputs to `Dense` ' - 'should be defined. Found `None`.') - last_dim = tensor_shape.dimension_value(input_shape[-1]) - self.input_spec = InputSpec(min_ndim=2, - axes={-1: last_dim}) + # Handle 1-d inputs by reshaping to (-1, 1). 
+ if input_shape.rank == 1: + input_shape = tensor_shape.TensorShape(input_shape.as_list() + [1]) + last_dim = tensor_shape.dimension_value(1) + self.input_spec = InputSpec(min_ndim=1, max_ndim=2) + else: + if tensor_shape.dimension_value(input_shape[-1]) is None: + raise ValueError('The last dimension of the inputs to `Dense` ' + 'should be defined. Found `None`.') + last_dim = tensor_shape.dimension_value(input_shape[-1]) + self.input_spec = InputSpec(min_ndim=2, axes={-1: last_dim}) self.kernel = self.add_weight( 'kernel', shape=[last_dim, self.units], @@ -1160,6 +1165,8 @@ class Dense(Layer): output_shape = shape[:-1] + [self.units] outputs.set_shape(output_shape) else: + if rank == 1: + inputs = array_ops.expand_dims_v2(inputs, axis=-1) inputs = math_ops.cast(inputs, self._compute_dtype) if K.is_sparse(inputs): outputs = sparse_ops.sparse_tensor_dense_matmul(inputs, self.kernel) diff --git a/tensorflow/python/keras/layers/merge.py b/tensorflow/python/keras/layers/merge.py index bf39f30b71a..57a97952e4f 100644 --- a/tensorflow/python/keras/layers/merge.py +++ b/tensorflow/python/keras/layers/merge.py @@ -89,7 +89,7 @@ class _Merge(Layer): @tf_utils.shape_type_conversion def build(self, input_shape): # Used purely for shape validation. - if not isinstance(input_shape, list): + if not isinstance(input_shape[0], tuple): raise ValueError('A merge layer should be called on a list of inputs.') if len(input_shape) < 2: raise ValueError('A merge layer should be called ' @@ -118,7 +118,7 @@ class _Merge(Layer): self._reshape_required = True def call(self, inputs): - if not isinstance(inputs, list): + if not isinstance(inputs, (list, tuple)): raise ValueError('A merge layer should be called on a list of inputs.') if self._reshape_required: reshaped_inputs = [] @@ -204,9 +204,9 @@ class _Merge(Layer): def compute_mask(self, inputs, mask=None): if mask is None: return None - if not isinstance(mask, list): + if not isinstance(mask, (tuple, list)): raise ValueError('`mask` should be a list.') - if not isinstance(inputs, list): + if not isinstance(inputs, (tuple, list)): raise ValueError('`inputs` should be a list.') if len(mask) != len(inputs): raise ValueError('The lists `inputs` and `mask` ' @@ -489,7 +489,7 @@ class Concatenate(_Merge): @tf_utils.shape_type_conversion def build(self, input_shape): # Used purely for shape validation. - if not isinstance(input_shape, list) or len(input_shape) < 2: + if not isinstance(input_shape[0], tuple) or len(input_shape) < 2: raise ValueError('A `Concatenate` layer should be called ' 'on a list of at least 2 inputs') if all(shape is None for shape in input_shape): @@ -523,7 +523,7 @@ class Concatenate(_Merge): @tf_utils.shape_type_conversion def compute_output_shape(self, input_shape): - if not isinstance(input_shape, list): + if not isinstance(input_shape, (tuple, list)): raise ValueError('A `Concatenate` layer should be called ' 'on a list of inputs.') input_shapes = input_shape @@ -538,9 +538,9 @@ class Concatenate(_Merge): def compute_mask(self, inputs, mask=None): if mask is None: return None - if not isinstance(mask, list): + if not isinstance(mask, (tuple, list)): raise ValueError('`mask` should be a list.') - if not isinstance(inputs, list): + if not isinstance(inputs, (tuple, list)): raise ValueError('`inputs` should be a list.') if len(mask) != len(inputs): raise ValueError('The lists `inputs` and `mask` ' @@ -656,7 +656,7 @@ class Dot(_Merge): @tf_utils.shape_type_conversion def build(self, input_shape): # Used purely for shape validation. 
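The `Dense` hunk above teaches the layer to accept rank-1 inputs by treating shape `(batch,)` as `(batch, 1)` (the kernel becomes `(1, units)` and the input is expanded with `expand_dims` in `call`). A minimal sketch of what that means for callers, assuming the patched layer; on an unpatched TensorFlow the explicit reshape on the last line is still required:

```python
import numpy as np
import tensorflow as tf

layer = tf.keras.layers.Dense(4)
x = np.array([1.0, 2.0, 3.0], dtype=np.float32)   # rank-1 input, shape (3,)

# With the rank-1 handling above, the layer would expand this to shape (3, 1)
# internally, so the kernel is (1, 4) and the output is (3, 4):
# y = layer(x)                                    # patched behavior only

# Equivalent call that also works without the patch:
y = layer(x.reshape(-1, 1))                       # output shape (3, 4)
print(y.shape)
```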
- if not isinstance(input_shape, list) or len(input_shape) != 2: + if not isinstance(input_shape[0], tuple) or len(input_shape) != 2: raise ValueError('A `Dot` layer should be called ' 'on a list of 2 inputs.') shape1 = input_shape[0] @@ -701,7 +701,7 @@ class Dot(_Merge): @tf_utils.shape_type_conversion def compute_output_shape(self, input_shape): - if not isinstance(input_shape, list) or len(input_shape) != 2: + if not isinstance(input_shape, (tuple, list)) or len(input_shape) != 2: raise ValueError('A `Dot` layer should be called ' 'on a list of 2 inputs.') shape1 = list(input_shape[0]) diff --git a/tensorflow/python/keras/layers/normalization_test.py b/tensorflow/python/keras/layers/normalization_test.py index 5222a32857d..687b76dbe98 100644 --- a/tensorflow/python/keras/layers/normalization_test.py +++ b/tensorflow/python/keras/layers/normalization_test.py @@ -37,6 +37,7 @@ from tensorflow.python.keras.mixed_precision.experimental import policy from tensorflow.python.keras.optimizer_v2 import rmsprop as rmsprop_v2 from tensorflow.python.ops import array_ops from tensorflow.python.ops import gradient_checker_v2 +from tensorflow.python.ops import math_ops from tensorflow.python.platform import test from tensorflow.python.training import gradient_descent @@ -498,7 +499,8 @@ class NormalizationLayersGraphModeOnlyTest( def _run_layernorm_correctness_test(layer, dtype='float32'): model = keras.models.Sequential() - norm = layer(input_shape=(2, 2, 2)) + model.add(keras.layers.Lambda(lambda x: math_ops.cast(x, dtype='float16'))) + norm = layer(input_shape=(2, 2, 2), dtype=dtype) model.add(norm) model.compile( loss='mse', diff --git a/tensorflow/python/keras/layers/preprocessing/normalization_test.py b/tensorflow/python/keras/layers/preprocessing/normalization_test.py index e1573df3387..227e961751e 100644 --- a/tensorflow/python/keras/layers/preprocessing/normalization_test.py +++ b/tensorflow/python/keras/layers/preprocessing/normalization_test.py @@ -43,36 +43,40 @@ def get_layer_class(): def _get_layer_computation_test_cases(): test_cases = ({ - "adapt_data": np.array([[1.], [2.], [3.], [4.], [5.]]), + "adapt_data": np.array([[1.], [2.], [3.], [4.], [5.]], dtype=np.float32), "axis": -1, - "test_data": np.array([[1.], [2.], [3.]]), - "expected": np.array([[-1.414214], [-.707107], [0]]), + "test_data": np.array([[1.], [2.], [3.]], np.float32), + "expected": np.array([[-1.414214], [-.707107], [0]], np.float32), "testcase_name": "2d_single_element" }, { "adapt_data": - np.array([[[1., 2., 3.], [2., 3., 4.]], [[3., 4., 5.], [4., 5., - 6.]]]), + np.array([[[1., 2., 3.], [2., 3., 4.]], [[3., 4., 5.], [4., 5., 6.]]], + np.float32), "axis": 1, "test_data": - np.array([[[1., 2., 3.], [2., 3., 4.]], [[3., 4., 5.], [4., 5., - 6.]]]), + np.array([[[1., 2., 3.], [2., 3., 4.]], [[3., 4., 5.], [4., 5., 6.]]], + np.float32), "expected": np.array([[[-1.549193, -0.774597, 0.], [-1.549193, -0.774597, 0.]], - [[0., 0.774597, 1.549193], [0., 0.774597, 1.549193]]]), + [[0., 0.774597, 1.549193], [0., 0.774597, 1.549193]]], + np.float32), "testcase_name": "3d_internal_axis" }, { "adapt_data": - np.array([[[1., 0., 3.], [2., 3., 4.]], [[3., -1., 5.], [4., 5., - 8.]]]), + np.array( + [[[1., 0., 3.], [2., 3., 4.]], [[3., -1., 5.], [4., 5., 8.]]], + np.float32), "axis": (1, 2), "test_data": - np.array([[[3., 1., -1.], [2., 5., 4.]], [[3., 0., 5.], [2., 5., - 8.]]]), + np.array( + [[[3., 1., -1.], [2., 5., 4.]], [[3., 0., 5.], [2., 5., 8.]]], + np.float32), "expected": - np.array([[[1., 3., -5.], [-1., 1., -1.]], - 
[[1., 1., 1.], [-1., 1., 1.]]]), + np.array( + [[[1., 3., -5.], [-1., 1., -1.]], [[1., 1., 1.], [-1., 1., 1.]]], + np.float32), "testcase_name": "3d_multiple_axis" }) diff --git a/tensorflow/python/keras/layers/wrappers_test.py b/tensorflow/python/keras/layers/wrappers_test.py index d3da18e703e..1a7886cf369 100644 --- a/tensorflow/python/keras/layers/wrappers_test.py +++ b/tensorflow/python/keras/layers/wrappers_test.py @@ -253,29 +253,28 @@ class TimeDistributedTest(keras_parameterized.TestCase): self.assertAllEqual(mask_outputs_val[i], ref_mask_val[i]) self.assertIs(mask_outputs[-1], None) # final layer + @tf_test_util.run_in_graph_and_eager_modes def test_TimeDistributed_with_masking_layer(self): - with self.cached_session(): - # test with Masking layer - model = keras.models.Sequential() - model.add(keras.layers.TimeDistributed(keras.layers.Masking( - mask_value=0.,), input_shape=(None, 4))) - model.add(keras.layers.TimeDistributed(keras.layers.Dense(5))) - model.compile(optimizer='rmsprop', loss='mse') - model_input = np.random.randint(low=1, high=5, size=(10, 3, 4)) - for i in range(4): - model_input[i, i:, :] = 0. - model.compile(optimizer='rmsprop', loss='mse') - model.fit(model_input, - np.random.random((10, 3, 5)), epochs=1, batch_size=6) - mask_outputs = [model.layers[0].compute_mask(model.input)] - mask_outputs += [model.layers[1].compute_mask(model.layers[1].input, - mask_outputs[-1])] - func = keras.backend.function([model.input], mask_outputs) - mask_outputs_val = func([model_input]) - self.assertEqual((mask_outputs_val[0]).all(), - model_input.all()) - self.assertEqual((mask_outputs_val[1]).all(), - model_input.all()) + # test with Masking layer + model = keras.models.Sequential() + model.add( + keras.layers.TimeDistributed( + keras.layers.Masking(mask_value=0.,), input_shape=(None, 4))) + model.add(keras.layers.TimeDistributed(keras.layers.Dense(5))) + model.compile(optimizer='rmsprop', loss='mse') + model_input = np.random.randint(low=1, high=5, size=(10, 3, 4)) + for i in range(4): + model_input[i, i:, :] = 0. + model.compile(optimizer='rmsprop', loss='mse') + model.fit(model_input, np.random.random((10, 3, 5)), epochs=1, batch_size=6) + mask_outputs = [model.layers[0].compute_mask(model.input)] + mask_outputs += [ + model.layers[1].compute_mask(model.layers[1].input, mask_outputs[-1]) + ] + func = keras.backend.function([model.input], mask_outputs) + mask_outputs_val = func([model_input]) + self.assertEqual((mask_outputs_val[0]).all(), model_input.all()) + self.assertEqual((mask_outputs_val[1]).all(), model_input.all()) def test_TimeDistributed_with_different_time_shapes(self): time_dist = keras.layers.TimeDistributed(keras.layers.Dense(5)) @@ -574,9 +573,9 @@ class BidirectionalTest(test.TestCase, parameterized.TestCase): output = bidi_rnn(inputs) model = keras.models.Model(inputs, output) - y_1 = model.predict(x) + y_1 = model.predict(x, batch_size=1) model.reset_states() - y_2 = model.predict(x) + y_2 = model.predict(x, batch_size=1) self.assertAllClose(y_1, y_2) diff --git a/tensorflow/python/keras/losses.py b/tensorflow/python/keras/losses.py index 85731398ea7..061e31140b7 100644 --- a/tensorflow/python/keras/losses.py +++ b/tensorflow/python/keras/losses.py @@ -95,6 +95,17 @@ class Loss(object): # SUM_OVER_BATCH is only allowed in losses managed by `fit` or # CannedEstimators. 
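The `losses.py` hunk that begins here (its body continues just below) moves the scope-name cleanup out of `__call__` into a `_set_name_scope` helper. A standalone sketch of the sanitization rule it applies:

```python
def scope_name_for(name, cls_name='Loss'):
    """Sketch of the cleanup: None -> class name, '' (a wrapped lambda) ->
    'lambda', and leading/trailing underscores are stripped so the result is a
    valid tf name_scope, e.g. '_my_loss' -> 'my_loss'."""
    if name is None:
        return cls_name
    if name == '':
        return 'lambda'
    return name.strip('_')

assert scope_name_for(None) == 'Loss'
assert scope_name_for('') == 'lambda'
assert scope_name_for('_my_loss') == 'my_loss'
```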
self._allow_sum_over_batch_size = False + self._set_name_scope() + + def _set_name_scope(self): + """Creates a valid `name_scope` name.""" + if self.name is None: + self._name_scope = self.__class__.__name__ + elif self.name == '': + self._name_scope = 'lambda' + else: + # E.g. '_my_loss' => 'my_loss' + self._name_scope = self.name.strip('_') def __call__(self, y_true, y_pred, sample_weight=None): """Invokes the `Loss` instance. @@ -124,10 +135,9 @@ class Loss(object): """ # If we are wrapping a lambda function strip '<>' from the name as it is not # accepted in scope name. - scope_name = 'lambda' if self.name == '' else self.name graph_ctx = tf_utils.graph_context_for_symbolic_tensors( y_true, y_pred, sample_weight) - with K.name_scope(scope_name or self.__class__.__name__), graph_ctx: + with K.name_scope(self._name_scope), graph_ctx: losses = self.call(y_true, y_pred) return losses_utils.compute_weighted_loss( losses, sample_weight, reduction=self._get_reduction()) diff --git a/tensorflow/python/keras/metrics.py b/tensorflow/python/keras/metrics.py index bd0a8605135..1c851581a05 100644 --- a/tensorflow/python/keras/metrics.py +++ b/tensorflow/python/keras/metrics.py @@ -63,6 +63,7 @@ from tensorflow.python.ops import nn from tensorflow.python.ops import variables as tf_variables from tensorflow.python.ops import weights_broadcast_ops from tensorflow.python.ops.losses import util as tf_losses_utils +from tensorflow.python.util import nest from tensorflow.python.util.tf_export import keras_export from tensorflow.tools.docs import doc_controls @@ -3220,11 +3221,7 @@ def clone_metric(metric): def clone_metrics(metrics): """Clones the given metric list/dict.""" - if metrics is None: - return None - if isinstance(metrics, dict): - return {key: clone_metric(value) for key, value in metrics.items()} - return [clone_metric(metric) for metric in metrics] + return nest.map_structure(clone_metric, metrics) @keras_export('keras.metrics.serialize') @@ -3243,6 +3240,7 @@ def deserialize(config, custom_objects=None): @keras_export('keras.metrics.get') def get(identifier): + """Return a metric given its identifer.""" if isinstance(identifier, dict): return deserialize(identifier) elif isinstance(identifier, six.string_types): @@ -3250,5 +3248,6 @@ def get(identifier): elif callable(identifier): return identifier else: - raise ValueError('Could not interpret ' - 'metric function identifier: %s' % identifier) + error_msg = 'Could not interpret metric function identifier: {}'.format( + identifier) + raise ValueError(error_msg) diff --git a/tensorflow/python/keras/metrics_correctness_test.py b/tensorflow/python/keras/metrics_correctness_test.py index f372996141b..ea4222b6935 100644 --- a/tensorflow/python/keras/metrics_correctness_test.py +++ b/tensorflow/python/keras/metrics_correctness_test.py @@ -21,7 +21,6 @@ from __future__ import print_function from absl.testing import parameterized import numpy as np -from tensorflow.python import tf2 from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import layers from tensorflow.python.keras import losses @@ -29,6 +28,7 @@ from tensorflow.python.keras import metrics from tensorflow.python.keras import testing_utils from tensorflow.python.ops.losses import loss_reduction from tensorflow.python.platform import test +from tensorflow.python.util import nest def get_multi_io_model(): @@ -51,13 +51,6 @@ def custom_generator_multi_io(sample_weights=None): inputs = np.asarray([[1.], [2.], [3.], [4.]]) targets_1 = np.asarray([[2.], [4.], 
[6.], [8.]]) targets_2 = np.asarray([[1.], [2.], [3.], [4.]]) - if sample_weights: - assert len(sample_weights) == 2 - w1 = sample_weights[0] - w2 = sample_weights[1] - else: - w1 = None - w2 = None i = 0 while True: batch_index = i * batch_size % num_samples @@ -67,17 +60,14 @@ def custom_generator_multi_io(sample_weights=None): x = [inputs[start:end], inputs[start:end]] y = [targets_1[start:end], targets_2[start:end]] if sample_weights: - w = [ - None if w1 is None else w1[start:end], - None if w2 is None else w2[start:end] - ] + sw = nest.map_structure(lambda w: w[start:end], sample_weights) else: - w = None - yield x, y, w + sw = None + yield x, y, sw @keras_parameterized.run_with_all_model_types(exclude_models=['sequential']) -@keras_parameterized.run_all_keras_modes +@keras_parameterized.run_all_keras_modes(always_skip_v1=True) class TestMetricsCorrectnessMultiIO(keras_parameterized.TestCase): def _get_compiled_multi_io_model(self): @@ -100,8 +90,6 @@ class TestMetricsCorrectnessMultiIO(keras_parameterized.TestCase): self.y2 = np.asarray([[1.], [2.], [3.], [4.]]) self.sample_weight_1 = np.asarray([2., 3., 4., 5.]) self.sample_weight_2 = np.asarray([3.5, 2.5, 1.5, 0.5]) - self.class_weight_1 = {2: 2, 4: 3, 6: 4, 8: 5} - self.class_weight_2 = {1: 3.5, 2: 2.5, 3: 1.5, 4: 0.5} # y_true_1 = [[2.], [4.], [6.], [8.]], y_pred = [[3.], [6.], [9.], [12.]] # y_true_2 = [[1.], [2.], [3.], [4.]], y_pred = [[3.], [6.], [9.], [12.]] @@ -148,8 +136,6 @@ class TestMetricsCorrectnessMultiIO(keras_parameterized.TestCase): # Total loss without weights = 7.5 + 30 = 37.5 self.wmse = 'mean_squared_error_2' - if not tf2.enabled(): - self.wmse = 'weighted_' + self.wmse self.expected_fit_result_with_weights = { 'output_1_mean_squared_error': [7.5, 7.5], 'output_2_mean_squared_error': [30, 30], @@ -223,29 +209,6 @@ class TestMetricsCorrectnessMultiIO(keras_parameterized.TestCase): for key, value in self.expected_fit_result_with_weights_output_2.items(): self.assertAllClose(history.history[key], value, 1e-3) - def test_fit_with_class_weight(self): - model = self._get_compiled_multi_io_model() - history = model.fit([self.x, self.x], [self.y1, self.y2], - class_weight={ - 'output_1': self.class_weight_1, - 'output_2': self.class_weight_2, - }, - batch_size=2, - epochs=2, - shuffle=False) - for key, value in self.expected_fit_result_with_weights.items(): - self.assertAllClose(history.history[key], value, 1e-3) - - # Set weights for one output. - history = model.fit([self.x, self.x], [self.y1, self.y2], - class_weight={'output_2': self.class_weight_2}, - batch_size=2, - epochs=2, - shuffle=False) - - for key, value in self.expected_fit_result_with_weights_output_2.items(): - self.assertAllClose(history.history[key], value, 1e-3) - def test_eval(self): model = self._get_compiled_multi_io_model() eval_result = model.evaluate([self.x, self.x], [self.y1, self.y2], @@ -304,23 +267,6 @@ class TestMetricsCorrectnessMultiIO(keras_parameterized.TestCase): self.assertAllClose(result, self.expected_batch_result_with_weights_output_2, 1e-3) - def test_train_on_batch_with_class_weight(self): - model = self._get_compiled_multi_io_model() - result = model.train_on_batch([self.x, self.x], [self.y1, self.y2], - class_weight={ - 'output_1': self.class_weight_1, - 'output_2': self.class_weight_2, - }) - self.assertAllClose(result, self.expected_batch_result_with_weights, 1e-3) - - # Set weights for one output. 
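The rewritten generator above slices every sample-weight array with a single `nest.map_structure` call instead of unpacking the structure by hand. A small illustration using the public `tf.nest` API (the test itself imports the internal `nest` module):

```python
import numpy as np
import tensorflow as tf

# sample_weights may be a dict keyed by output name, a list, or None; the same
# slice is applied to every leaf array, whatever the nesting.
sample_weights = {'output_2': np.asarray([3.5, 2.5, 1.5, 0.5])}
start, end = 0, 2
sw = tf.nest.map_structure(lambda w: w[start:end], sample_weights)
print(sw)   # {'output_2': array([3.5, 2.5])}
```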
- result = model.train_on_batch([self.x, self.x], [self.y1, self.y2], - class_weight={ - 'output_2': self.class_weight_2, - }) - self.assertAllClose(result, - self.expected_batch_result_with_weights_output_2, 1e-3) - def test_test_on_batch(self): model = self._get_compiled_multi_io_model() result = model.test_on_batch([self.x, self.x], [self.y1, self.y2]) @@ -362,29 +308,8 @@ class TestMetricsCorrectnessMultiIO(keras_parameterized.TestCase): # Set weights for one output. history = model.fit_generator( - custom_generator_multi_io(sample_weights=[None, self.sample_weight_2]), - steps_per_epoch=2, - epochs=2) - for key, value in self.expected_fit_result_with_weights_output_2.items(): - self.assertAllClose(history.history[key], value, 1e-3) - - def test_fit_generator_with_class_weight(self): - model = self._get_compiled_multi_io_model() - history = model.fit_generator( - custom_generator_multi_io(), - class_weight={ - 'output_1': self.class_weight_1, - 'output_2': self.class_weight_2, - }, - steps_per_epoch=2, - epochs=2) - for key, value in self.expected_fit_result_with_weights.items(): - self.assertAllClose(history.history[key], value, 1e-3) - - # Set weights for one output. - history = model.fit_generator( - custom_generator_multi_io(), - class_weight={'output_2': self.class_weight_2}, + custom_generator_multi_io( + sample_weights={'output_2': self.sample_weight_2}), steps_per_epoch=2, epochs=2) for key, value in self.expected_fit_result_with_weights_output_2.items(): @@ -406,14 +331,15 @@ class TestMetricsCorrectnessMultiIO(keras_parameterized.TestCase): # Set weights for one output. eval_result = model.evaluate_generator( - custom_generator_multi_io(sample_weights=[None, self.sample_weight_2]), + custom_generator_multi_io( + sample_weights={'output_2': self.sample_weight_2}), steps=2) self.assertAllClose(eval_result, self.expected_batch_result_with_weights_output_2, 1e-3) @keras_parameterized.run_with_all_model_types -@keras_parameterized.run_all_keras_modes +@keras_parameterized.run_all_keras_modes(always_skip_v1=True) class TestMetricsCorrectnessSingleIO(keras_parameterized.TestCase): def _get_model(self): @@ -452,7 +378,8 @@ class TestMetricsCorrectnessSingleIO(keras_parameterized.TestCase): self.x = np.asarray([[1.], [2.], [3.], [4.]]) self.y = np.asarray([[2.], [4.], [6.], [8.]]) self.sample_weight = np.asarray([2., 3., 4., 5.]) - self.class_weight = {2: 2, 4: 3, 6: 4, 8: 5} + self.class_weight = {i: 1 for i in range(10)} + self.class_weight.update({2: 2, 4: 3, 6: 4, 8: 5}) # y_true = [[2.], [4.], [6.], [8.]], y_pred = [[3.], [6.], [9.], [12.]] @@ -483,8 +410,6 @@ class TestMetricsCorrectnessSingleIO(keras_parameterized.TestCase): # Result = 7.5 wmse = 'mean_squared_error_2' - if not tf2.enabled(): - wmse = 'weighted_' + wmse self.expected_fit_result_with_weights = { 'mean_squared_error': [7.5, 7.5], diff --git a/tensorflow/python/keras/models.py b/tensorflow/python/keras/models.py index 7620f2f072e..0b0121f521e 100644 --- a/tensorflow/python/keras/models.py +++ b/tensorflow/python/keras/models.py @@ -552,6 +552,8 @@ def _reset_build_compile_trackers(model): model.outputs = None # Reset compile state model._is_compiled = False # pylint:disable=protected-access + if not ops.executing_eagerly_outside_functions(): + model._v1_compile_was_called = False model.optimizer = None @@ -639,20 +641,23 @@ def clone_and_build_model( 'Error when cloning model: compile_clone was set to True, but the ' 'original model has not been compiled.') - with CustomObjectScope(custom_objects or {}): - if 
model._is_graph_network or isinstance(model, Sequential): - clone = clone_model(model, input_tensors=input_tensors) + if compile_clone: + compile_args = model._get_compile_args() # pylint: disable=protected-access + # Allows this method to be robust to switching graph and eager classes. + model._get_compile_args = lambda: compile_args - if all([ - isinstance(clone, Sequential), not clone._is_graph_network, - getattr(model, '_build_input_shape', None) is not None - ]): - # Set model inputs to build the model and add input/output properties. - # TODO(kathywu): Add multiple placeholders to handle edge case where - # sequential model has multiple inputs. - clone._set_inputs( - K.placeholder( - model._build_input_shape, dtype=model.inputs[0].dtype)) + with CustomObjectScope(custom_objects or {}): + if model._is_graph_network: + clone = clone_model(model, input_tensors=input_tensors) + elif isinstance(model, Sequential): + clone = clone_model(model, input_tensors=input_tensors) + if (not clone._is_graph_network and model._build_input_shape is not None): + if ops.executing_eagerly_outside_functions(): + clone.build(model._build_input_shape) + else: + clone._set_inputs( + K.placeholder( + model._build_input_shape, dtype=model.inputs[0].dtype)) else: try: # Prefer clonining the model if serial/deserial logic is implemented for @@ -704,14 +709,15 @@ def clone_and_build_model( if len(optimizer) == 1: optimizer = optimizer[0] - clone.compile( - optimizer, - model.loss, - metrics=metrics_module.clone_metrics(model._compile_metrics), - loss_weights=model.loss_weights, - sample_weight_mode=model.sample_weight_mode, - weighted_metrics=metrics_module.clone_metrics( - model._compile_weighted_metrics), - target_tensors=target_tensors) + + compile_args['optimizer'] = optimizer + if target_tensors is not None: + compile_args['target_tensors'] = target_tensors + # Ensure Metric objects in new model are separate from existing model. + compile_args['metrics'] = metrics_module.clone_metrics( + compile_args['metrics']) + compile_args['weighted_metrics'] = metrics_module.clone_metrics( + compile_args['weighted_metrics']) + clone.compile(**compile_args) return clone diff --git a/tensorflow/python/keras/models_test.py b/tensorflow/python/keras/models_test.py index 3f9289b1021..8120afa0a55 100644 --- a/tensorflow/python/keras/models_test.py +++ b/tensorflow/python/keras/models_test.py @@ -412,8 +412,6 @@ class TestCloneAndBuildModel(keras_parameterized.TestCase): isinstance(model.optimizer, (keras.optimizers.RMSprop, keras.optimizer_v2.rmsprop.RMSprop))) - self.assertEqual(['acc', metrics.categorical_accuracy], - model._compile_metrics) def _clone_and_build_test_helper(self, model, model_type): inp = np.random.random((10, 4)) @@ -500,15 +498,13 @@ class TestCloneAndBuildModel(keras_parameterized.TestCase): @keras_parameterized.run_with_all_model_types @keras_parameterized.run_all_keras_modes def test_replace_tf_optimizer_iterations_variable(self): + if context.executing_eagerly(): + self.skipTest('v1 optimizers not supported with eager.') self.assert_optimizer_iterations_increases(adam.AdamOptimizer(0.01)) @keras_parameterized.run_with_all_model_types @keras_parameterized.run_all_keras_modes def test_replace_keras_optimizer_iterations_variable(self): - if testing_utils.should_run_eagerly(): - # This needs to be updated to run with v2 optimizers. 
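One detail worth spelling out from the `clone_and_build_model` hunk above: `Metric` objects are stateful, which is why the cloned model is compiled with fresh copies from `clone_metrics` rather than sharing the originals. A minimal illustration:

```python
import tensorflow as tf

shared = tf.keras.metrics.Mean()
shared.update_state([1.0, 2.0])        # updates from the original model
shared.update_state([10.0])            # updates from a clone would mix in
print(float(shared.result()))          # ~4.33, the two models' state blended

# clone_metric essentially rebuilds the metric from its config, so the clone
# starts with an independent accumulator.
fresh = shared.__class__.from_config(shared.get_config())
print(float(fresh.result()))           # 0.0
```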
- self.skipTest('b/120991591') - self.assert_optimizer_iterations_increases('adam') def test_clone_optimizer_in_different_graph(self): diff --git a/tensorflow/python/keras/premade/linear.py b/tensorflow/python/keras/premade/linear.py index dd3e1fdfaeb..32300421afa 100644 --- a/tensorflow/python/keras/premade/linear.py +++ b/tensorflow/python/keras/premade/linear.py @@ -97,7 +97,7 @@ class LinearModel(training.Model): def build(self, input_shape): self.dense_layers = [] - if isinstance(input_shape, list): + if isinstance(input_shape, (tuple, list)): for shape in input_shape: layer = core.Dense( units=self.units, diff --git a/tensorflow/python/keras/premade/wide_deep.py b/tensorflow/python/keras/premade/wide_deep.py index ba524367bc6..2f339786c67 100644 --- a/tensorflow/python/keras/premade/wide_deep.py +++ b/tensorflow/python/keras/premade/wide_deep.py @@ -18,10 +18,13 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.eager import backprop +from tensorflow.python.framework import ops from tensorflow.python.keras import activations from tensorflow.python.keras import backend as K from tensorflow.python.keras import layers as layer_module from tensorflow.python.keras.engine import base_layer +from tensorflow.python.keras.engine import data_adapter from tensorflow.python.keras.engine import training as keras_training from tensorflow.python.keras.utils import generic_utils from tensorflow.python.util import nest @@ -106,25 +109,38 @@ class WideDeepModel(keras_training.Model): return nest.map_structure(self.activation, output) return output - def _get_optimizers(self): - if isinstance(self.optimizer, (tuple, list)): - return (self.optimizer[0], self.optimizer[1]) - else: - return (self.optimizer, self.optimizer) - # This does not support gradient scaling and LossScaleOptimizer. 
- def _backwards(self, tape, loss): - linear_vars = self.linear_model.trainable_weights # pylint: disable=protected-access - dnn_vars = self.dnn_model.trainable_weights # pylint: disable=protected-access - linear_grads, dnn_grads = tape.gradient(loss, (linear_vars, dnn_vars)) - linear_optimizer, dnn_optimizer = self._get_optimizers() - linear_optimizer.apply_gradients(zip(linear_grads, linear_vars)) - dnn_optimizer.apply_gradients(zip(dnn_grads, dnn_vars)) - return + def _train_step(self, data): + x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data) + x, y, sample_weight = data_adapter.expand_1d((x, y, sample_weight)) + + with backprop.GradientTape() as tape: + y_pred = self(x, training=True) + loss = self.compiled_loss( + y, y_pred, sample_weight, regularization_losses=self.losses) + self.compiled_metrics.update_state(y, y_pred, sample_weight) + + if isinstance(self.optimizer, (list, tuple)): + linear_vars = self.linear_model.trainable_variables + dnn_vars = self.dnn_model.trainable_variables + linear_grads, dnn_grads = tape.gradient(loss, (linear_vars, dnn_vars)) + + linear_optimizer = self.optimizer[0] + dnn_optimizer = self.optimizer[1] + linear_optimizer.apply_gradients(zip(linear_grads, linear_vars)) + dnn_optimizer.apply_gradients(zip(dnn_grads, dnn_vars)) + else: + trainable_variables = self.trainable_variables + grads = tape.gradient(loss, trainable_variables) + self.optimizer.apply_gradients(zip(grads, trainable_variables)) + + return {m.name: m.result() for m in self.metrics} def _make_train_function(self): - # TODO(tanzheny): This is a direct copy from super to make it work - # refactor it so that common logic can be shared. + if ops.executing_eagerly_outside_functions(): + return super(WideDeepModel, self)._make_train_function() + + # Only needed for graph mode and model_to_estimator. 
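The new `_train_step` above splits one gradient computation across two optimizers when `self.optimizer` is a list or tuple. A self-contained sketch of that pattern (the layer sizes and optimizer choices here are illustrative, not the ones `WideDeepModel` uses):

```python
import tensorflow as tf

linear = tf.keras.layers.Dense(1)                      # stand-in for linear_model
dnn = tf.keras.Sequential([tf.keras.layers.Dense(4, activation='relu'),
                           tf.keras.layers.Dense(1)])  # stand-in for dnn_model
linear_opt = tf.keras.optimizers.SGD(0.1)
dnn_opt = tf.keras.optimizers.Adam(0.01)

x = tf.random.normal([8, 3])
y = tf.random.normal([8, 1])

with tf.GradientTape() as tape:
    y_pred = linear(x) + dnn(x)
    loss = tf.reduce_mean(tf.square(y - y_pred))

# One tape.gradient call over a tuple of variable lists returns a matching
# tuple of gradient lists; each optimizer then updates only its own variables.
linear_grads, dnn_grads = tape.gradient(
    loss, (linear.trainable_variables, dnn.trainable_variables))
linear_opt.apply_gradients(zip(linear_grads, linear.trainable_variables))
dnn_opt.apply_gradients(zip(dnn_grads, dnn.trainable_variables))
```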
has_recompiled = self._recompile_weights_loss_and_weighted_metrics() self._check_trainable_weights_consistency() # If we have re-compiled the loss/weighted metric sub-graphs then create @@ -140,7 +156,13 @@ class WideDeepModel(keras_training.Model): if not isinstance(K.symbolic_learning_phase(), int): inputs += [K.symbolic_learning_phase()] - linear_optimizer, dnn_optimizer = self._get_optimizers() + if isinstance(self.optimizer, (list, tuple)): + linear_optimizer = self.optimizer[0] + dnn_optimizer = self.optimizer[1] + else: + linear_optimizer = self.optimizer + dnn_optimizer = self.optimizer + with K.get_graph().as_default(): with K.name_scope('training'): # Training updates diff --git a/tensorflow/python/keras/premade/wide_deep_test.py b/tensorflow/python/keras/premade/wide_deep_test.py index e2f471e3575..3b58984bd11 100644 --- a/tensorflow/python/keras/premade/wide_deep_test.py +++ b/tensorflow/python/keras/premade/wide_deep_test.py @@ -258,8 +258,6 @@ class WideDeepModelTest(keras_parameterized.TestCase): run_eagerly=testing_utils.should_run_eagerly(), experimental_run_tf_function=testing_utils.should_run_tf_function()) wide_deep_model.fit(x={'symbol': data}, y=y, batch_size=32, epochs=10) - self.assertEqual(3, linear_model.inputs[0].shape[1]) - self.assertEqual(5, dnn_model.inputs[0].shape[1]) def test_config(self): linear_model = linear.LinearModel(units=1) diff --git a/tensorflow/python/keras/saving/hdf5_format_test.py b/tensorflow/python/keras/saving/hdf5_format_test.py index 66a712c4f2e..6c94ed50517 100644 --- a/tensorflow/python/keras/saving/hdf5_format_test.py +++ b/tensorflow/python/keras/saving/hdf5_format_test.py @@ -818,19 +818,23 @@ class TestWholeModelSaving(test.TestCase, parameterized.TestCase): evaluation_results['sparse_categorical_crossentropy'] + evaluation_results['custom_loss'], evaluation_results['loss'], 1e-6) + @test_util.run_in_graph_and_eager_modes def test_save_uncompiled_model_with_optimizer(self): - saved_model_dir = self._save_model_dir() - save_format = testing_utils.get_save_format() - model = keras.models.Sequential([keras.layers.Dense(1, input_shape=(3,))]) - # Set the model's optimizer but don't compile. This can happen if the model - # is trained with a custom training loop. - model.optimizer = keras.optimizer_v2.rmsprop.RMSprop(lr=0.0001) - model.save(saved_model_dir, save_format=save_format) + with self.cached_session() as session: + saved_model_dir = self._save_model_dir() + save_format = testing_utils.get_save_format() + model = keras.models.Sequential([keras.layers.Dense(1, input_shape=(3,))]) + # Set the model's optimizer but don't compile. This can happen if the + # model is trained with a custom training loop. + model.optimizer = keras.optimizer_v2.rmsprop.RMSprop(lr=0.0001) + if not context.executing_eagerly(): + session.run([v.initializer for v in model.variables]) + model.save(saved_model_dir, save_format=save_format) - if save_format in ['tf', 'tensorflow']: - loaded = keras.models.load_model(saved_model_dir) - self.assertIsInstance(loaded.optimizer, - keras.optimizer_v2.optimizer_v2.OptimizerV2) + if save_format in ['tf', 'tensorflow']: + loaded = keras.models.load_model(saved_model_dir) + self.assertIsInstance(loaded.optimizer, + keras.optimizer_v2.optimizer_v2.OptimizerV2) # Factory functions to create models that will be serialized inside a Network. 
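For context on the `hdf5_format_test` change above, this is the scenario it covers, condensed to eager mode: a model trained with a custom loop is never compiled but still has an optimizer attached, and the test asserts that a TF-format save/load round-trip restores that optimizer as an `OptimizerV2`. The path below is illustrative only:

```python
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(3,))])
model.optimizer = tf.keras.optimizers.RMSprop(learning_rate=1e-4)  # no compile()

model.save('/tmp/uncompiled_with_optimizer', save_format='tf')
loaded = tf.keras.models.load_model('/tmp/uncompiled_with_optimizer')
print(type(loaded.optimizer))   # expected: an OptimizerV2 subclass
```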
diff --git a/tensorflow/python/keras/saving/losses_serialization_test.py b/tensorflow/python/keras/saving/losses_serialization_test.py index 60252b1dbf4..8bdcc2a794d 100644 --- a/tensorflow/python/keras/saving/losses_serialization_test.py +++ b/tensorflow/python/keras/saving/losses_serialization_test.py @@ -48,11 +48,11 @@ class MyMeanAbsoluteError(losses.LossFunctionWrapper): reduction=losses_utils.ReductionV2.AUTO, name='mean_absolute_error'): super(MyMeanAbsoluteError, self).__init__( - _my_mae, name=name, reduction=reduction) + my_mae, name=name, reduction=reduction) # Custom loss function -def _my_mae(y_true, y_pred): +def my_mae(y_true, y_pred): return keras.backend.mean(math_ops.abs(y_pred - y_true), axis=-1) @@ -70,7 +70,7 @@ def _get_multi_io_model(): dict(testcase_name='string', value='mae'), dict(testcase_name='built_in_fn', value=losses.mae), dict(testcase_name='built_in_class', value=losses.MeanAbsoluteError()), - dict(testcase_name='custom_fn', value=_my_mae), + dict(testcase_name='custom_fn', value=my_mae), dict(testcase_name='custom_class', value=MyMeanAbsoluteError()), dict(testcase_name='list_of_strings', value=['mae', 'mae']), dict(testcase_name='list_of_built_in_fns', value=[losses.mae, losses.mae]), @@ -78,7 +78,7 @@ def _get_multi_io_model(): testcase_name='list_of_built_in_classes', value=[losses.MeanAbsoluteError(), losses.MeanAbsoluteError()]), - dict(testcase_name='list_of_custom_fns', value=[_my_mae, _my_mae]), + dict(testcase_name='list_of_custom_fns', value=[my_mae, my_mae]), dict( testcase_name='list_of_custom_classes', value=[MyMeanAbsoluteError(), @@ -104,8 +104,8 @@ def _get_multi_io_model(): dict( testcase_name='dict_of_custom_fn', value={ - 'output': _my_mae, - 'output_1': _my_mae + 'output': my_mae, + 'output_1': my_mae }), dict( testcase_name='dict_of_custom_class', @@ -128,7 +128,7 @@ class LossesSerialization(keras_parameterized.TestCase): def test_serializing_model_with_loss_with_custom_object_scope(self, value): with generic_utils.custom_object_scope({ 'MyMeanAbsoluteError': MyMeanAbsoluteError, - '_my_mae': _my_mae, + 'my_mae': my_mae, 'Bias': testing_utils.Bias, }): model = _get_multi_io_model() @@ -182,7 +182,7 @@ class LossesSerialization(keras_parameterized.TestCase): self.model_filename, custom_objects={ 'MyMeanAbsoluteError': MyMeanAbsoluteError, - '_my_mae': _my_mae, + 'my_mae': my_mae, 'Bias': testing_utils.Bias, }) loaded_model.predict([self.x, self.x]) diff --git a/tensorflow/python/keras/saving/metrics_serialization_test.py b/tensorflow/python/keras/saving/metrics_serialization_test.py index 10eee4d4175..7ecc2e5b087 100644 --- a/tensorflow/python/keras/saving/metrics_serialization_test.py +++ b/tensorflow/python/keras/saving/metrics_serialization_test.py @@ -69,17 +69,6 @@ def _get_multi_io_model(): dict(testcase_name='built_in_class', value=[metrics.MeanAbsoluteError]), dict(testcase_name='custom_fn', value=[_my_mae]), dict(testcase_name='custom_class', value=[MyMeanAbsoluteError]), - dict(testcase_name='list_of_strings', value=['mae', 'mae']), - dict( - testcase_name='list_of_built_in_fns', value=[metrics.mae, metrics.mae]), - dict( - testcase_name='list_of_built_in_classes', - value=[metrics.MeanAbsoluteError, metrics.MeanAbsoluteError]), - dict(testcase_name='list_of_custom_fns', value=[_my_mae, _my_mae]), - dict( - testcase_name='list_of_custom_classes', - value=[MyMeanAbsoluteError, MyMeanAbsoluteError]), - dict(testcase_name='list_of_string_and_list', value=['mae', ['mae']]), dict( testcase_name='list_of_built_in_fn_and_list', 
value=[metrics.mae, [metrics.mae]]), diff --git a/tensorflow/python/keras/saving/saved_model/load.py b/tensorflow/python/keras/saving/saved_model/load.py index 0aac128eb43..d53530ec1d7 100644 --- a/tensorflow/python/keras/saving/saved_model/load.py +++ b/tensorflow/python/keras/saving/saved_model/load.py @@ -445,8 +445,11 @@ class KerasObjectLoader(tf_load.Loader): model.__init__(layers, name=config['name']) if not model.inputs: first_layer = self._get_child_layer_node_ids(model_id, model.name)[0] - input_shape = self._infer_inputs(first_layer) - model._set_inputs(input_shape) # pylint: disable=protected-access + input_specs = self._infer_inputs(first_layer) + input_shapes = self._infer_inputs(first_layer, convert_to_shapes=True) + model._set_inputs(input_specs) # pylint: disable=protected-access + if not model.built and not isinstance(input_specs, dict): + model.build(input_shapes) else: (inputs, outputs, created_layers) = network_lib.reconstruct_from_config( config, created_layers={layer.name: layer for layer in layers}) diff --git a/tensorflow/python/keras/saving/saved_model/revive_test.py b/tensorflow/python/keras/saving/saved_model/revive_test.py index 36140e7fe20..3e267340caa 100644 --- a/tensorflow/python/keras/saving/saved_model/revive_test.py +++ b/tensorflow/python/keras/saving/saved_model/revive_test.py @@ -32,7 +32,6 @@ import numpy as np from tensorflow.python import keras from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops -from tensorflow.python.framework import tensor_spec from tensorflow.python.keras import backend from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import testing_utils @@ -121,12 +120,17 @@ class TestModelRevive(keras_parameterized.TestCase): def _assert_revived_correctness(self, model, revived): self.assertAllEqual(model.input_names, revived.input_names) self.assertAllEqual(model.output_names, revived.output_names) - self.assertTrue(all([ - i.shape.as_list() == r.shape.as_list() and i.dtype == r.dtype - for (i, r) in zip(model.inputs, revived.inputs)])) - self.assertTrue(all([ - i.shape.as_list() == r.shape.as_list() and i.dtype == r.dtype - for (i, r) in zip(model.outputs, revived.outputs)])) + if model.inputs is not None: + self.assertTrue( + all([ + i.shape.as_list() == r.shape.as_list() and i.dtype == r.dtype + for (i, r) in zip(model.inputs, revived.inputs) + ])) + self.assertTrue( + all([ + i.shape.as_list() == r.shape.as_list() and i.dtype == r.dtype + for (i, r) in zip(model.outputs, revived.outputs) + ])) self.assertAllClose(self.evaluate(model.weights), self.evaluate(revived.weights)) @@ -205,9 +209,8 @@ class TestModelRevive(keras_parameterized.TestCase): model = testing_utils.get_model_from_layers( layers, input_shape=input_shape) - # The inputs attribute must be defined in order to save the model. - if not model.inputs: - model._set_inputs(tensor_spec.TensorSpec((None, 2, 3))) + # Run data through the Model to create save spec and weights. + model.predict(np.ones((10, 2, 3)), batch_size=10) # Test that the correct checkpointed values are loaded, whether the layer is # created from the config or SavedModel. @@ -220,7 +223,8 @@ class TestModelRevive(keras_parameterized.TestCase): def test_revive_subclassed_with_nested_model(self): model = SubclassedModelNoConfig(1., 2.) - model._set_inputs(tensor_spec.TensorSpec((None, 2, 3))) + # Run data through the Model to create save spec and weights. 
+ model.predict(np.ones((10, 2, 3)), batch_size=10) model.save(self.path, save_format='tf') revived = keras_load.load(self.path) self._assert_revived_correctness(model, revived) diff --git a/tensorflow/python/keras/saving/saved_model/save_impl.py b/tensorflow/python/keras/saving/saved_model/save_impl.py index 3fcc649cba5..7bd2b52fe84 100644 --- a/tensorflow/python/keras/saving/saved_model/save_impl.py +++ b/tensorflow/python/keras/saving/saved_model/save_impl.py @@ -67,28 +67,13 @@ sequential_lib = LazyLoader( def should_skip_serialization(layer): """Skip serializing extra objects and functions if layer inputs aren't set.""" - if isinstance(layer, training_lib.Model): - try: - # pylint:disable=pointless-statement - layer.inputs - layer.input_names - # pylint:enable=pointless-statement - except AttributeError: - # If the model does not have inputs set, because it was not called or its - # input shapes were not recorded, we won't have a signature so can't trace - # a function. But the user may still save an object with this Model - # attached; we won't fail the whole tf.saved_model.save. - logging.warning('Skipping full serialization of Keras model {}, because ' - 'its inputs are not defined.'.format(layer)) - return True - else: - return False - else: - if not layer.built: - logging.warning('Skipping full serialization of Keras layer {}, because ' - 'it is not built.'.format(layer)) - return True - return False + saved_model_input_spec_set = (isinstance(layer, training_lib.Model) and + layer._saved_model_inputs_spec is not None) # pylint: disable=protected-access + if not layer.built and not saved_model_input_spec_set: + logging.warning('Skipping full serialization of Keras layer {}, because ' + 'it is not built.'.format(layer)) + return True + return False def wrap_layer_objects(layer, serialization_cache): diff --git a/tensorflow/python/keras/saving/saved_model/saved_model_test.py b/tensorflow/python/keras/saving/saved_model/saved_model_test.py index 018edc030e7..da86a7cdac1 100644 --- a/tensorflow/python/keras/saving/saved_model/saved_model_test.py +++ b/tensorflow/python/keras/saving/saved_model/saved_model_test.py @@ -85,17 +85,23 @@ class LayerWithLoss(keras.layers.Layer): def call(self, inputs): self.add_loss(math_ops.reduce_sum(inputs), inputs) - return inputs + return inputs * 2 class LayerWithUpdate(keras.layers.Layer): def build(self, _): - self.v = self.add_weight('v', shape=[], dtype=dtypes.int32) + self.v = self.add_weight( + 'v', + shape=[], + initializer=keras.initializers.zeros, + trainable=False, + dtype=dtypes.float32) - def call(self, inputs): - self.add_update(self.v.assign_add(math_ops.reduce_sum(inputs))) - return inputs + def call(self, inputs, training=True): + if training: + self.add_update(self.v.assign_add(1.)) + return inputs * 2. @keras_parameterized.run_all_keras_modes @@ -249,7 +255,7 @@ class TestModelSavingAndLoadingV2(keras_parameterized.TestCase): model.add_loss(eager_loss) # Call predict to ensure that all layers are built and inputs are set. 
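The revive-test and `save_impl` hunks above replace the private `_set_inputs(TensorSpec(...))` trick with simply running a batch of data through the model, which both creates the weights and records the saved-model input spec needed for serialization. A sketch of that pattern with a hypothetical stand-in model:

```python
import numpy as np
import tensorflow as tf

class TinySubclassed(tf.keras.Model):          # hypothetical stand-in
    def __init__(self):
        super(TinySubclassed, self).__init__()
        self.dense = tf.keras.layers.Dense(1)

    def call(self, inputs):
        return self.dense(inputs)

model = TinySubclassed()
# Running data through the model builds its weights and save spec, so the
# SavedModel export below has a concrete input signature to trace.
model.predict(np.ones((10, 2, 3), dtype=np.float32), batch_size=10)
model.save('/tmp/tiny_subclassed', save_format='tf')   # path is illustrative
```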
- model.predict(np.random.random((1, 3))) + model.predict(np.random.random((1, 3)).astype(np.float32)) saved_model_dir = self._save_model_dir() tf_save.save(model, saved_model_dir) @@ -608,13 +614,13 @@ class TestModelSavingAndLoadingV2(keras_parameterized.TestCase): def _testAddUpdate(self, scope): with scope: - layer_with_update = LayerWithUpdate(dtype=dtypes.int32) + layer_with_update = LayerWithUpdate() model = testing_utils.get_model_from_layers([layer_with_update], - input_shape=(3,), - input_dtype=dtypes.int32) + input_shape=(3,)) + x = np.ones((10, 3)) if testing_utils.get_model_type() == 'subclass': - model._set_inputs(constant_op.constant([[1, 2, 3]], dtype=dtypes.int32)) + model.predict(x, batch_size=10) self.evaluate(variables.variables_initializer(model.variables)) saved_model_dir = self._save_model_dir() model.save(saved_model_dir, save_format='tf') @@ -622,11 +628,11 @@ class TestModelSavingAndLoadingV2(keras_parameterized.TestCase): loaded = keras_load.load(saved_model_dir) loaded_layer = loaded.layers[-1] self.evaluate(variables.variables_initializer(loaded.variables)) - self.assertEqual(self.evaluate(loaded_layer.v), 0) + self.assertEqual(self.evaluate(loaded_layer.v), 0.) - loaded.predict(constant_op.constant([[1, 2, 3]], dtype=dtypes.int32), - steps=1) - self.assertEqual(self.evaluate(loaded_layer.v), 6) + loaded.compile('sgd', 'mse') + loaded.fit(x, x, batch_size=10) + self.assertEqual(self.evaluate(loaded_layer.v), 1.) @keras_parameterized.run_with_all_model_types def testSaveLayerWithUpdates(self): diff --git a/tensorflow/python/keras/saving/saved_model_experimental_test.py b/tensorflow/python/keras/saving/saved_model_experimental_test.py index 11a3ff5e1ab..2f3cf7cf9c9 100644 --- a/tensorflow/python/keras/saving/saved_model_experimental_test.py +++ b/tensorflow/python/keras/saving/saved_model_experimental_test.py @@ -32,8 +32,6 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_spec from tensorflow.python.framework import test_util -from tensorflow.python.keras import keras_parameterized -from tensorflow.python.keras import testing_utils from tensorflow.python.keras.engine import training as model_lib from tensorflow.python.keras.optimizer_v2 import adadelta from tensorflow.python.keras.optimizer_v2 import rmsprop @@ -47,7 +45,7 @@ from tensorflow.python.saved_model import model_utils from tensorflow.python.training import training as training_module -@keras_parameterized.run_all_keras_modes() +@test_util.run_deprecated_v1 # Removed in v2. 
class TestModelSavingandLoading(parameterized.TestCase, test.TestCase): def _save_model_dir(self, dirname='saved_model'): @@ -65,9 +63,7 @@ class TestModelSavingandLoading(parameterized.TestCase, test.TestCase): loss=keras.losses.MSE, optimizer=rmsprop.RMSprop(lr=0.0001), metrics=[keras.metrics.categorical_accuracy], - sample_weight_mode='temporal', - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) + sample_weight_mode='temporal') x = np.random.random((1, 3)) y = np.random.random((1, 3, 3)) model.train_on_batch(x, y) @@ -81,7 +77,6 @@ class TestModelSavingandLoading(parameterized.TestCase, test.TestCase): y = loaded_model.predict(x) self.assertAllClose(ref_y, y, atol=1e-05) - @test_util.run_in_graph_and_eager_modes def test_saving_sequential_model_without_compile(self): with self.cached_session(): model = keras.models.Sequential() @@ -109,9 +104,7 @@ class TestModelSavingandLoading(parameterized.TestCase, test.TestCase): model.compile( loss=keras.losses.MSE, optimizer=rmsprop.RMSprop(lr=0.0001), - metrics=[keras.metrics.categorical_accuracy], - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) + metrics=[keras.metrics.categorical_accuracy]) x = np.random.random((1, 3)) y = np.random.random((1, 3)) model.train_on_batch(x, y) @@ -125,7 +118,6 @@ class TestModelSavingandLoading(parameterized.TestCase, test.TestCase): y = loaded_model.predict(x) self.assertAllClose(ref_y, y, atol=1e-05) - @test_util.run_in_graph_and_eager_modes def test_saving_functional_model_without_compile(self): with self.cached_session(): inputs = keras.layers.Input(shape=(3,)) @@ -146,7 +138,6 @@ class TestModelSavingandLoading(parameterized.TestCase, test.TestCase): y = loaded_model.predict(x) self.assertAllClose(ref_y, y, atol=1e-05) - @test_util.run_in_graph_and_eager_modes def test_saving_with_tf_optimizer(self): model = keras.models.Sequential() model.add(keras.layers.Dense(2, input_shape=(3,))) @@ -167,9 +158,7 @@ class TestModelSavingandLoading(parameterized.TestCase, test.TestCase): loaded_model.compile( loss='mse', optimizer=training_module.RMSPropOptimizer(0.1), - metrics=['acc'], - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) + metrics=['acc']) y = loaded_model.predict(x) self.assertAllClose(ref_y, y, atol=1e-05) @@ -290,7 +279,7 @@ def load_model(sess, path, mode): return inputs, outputs, meta_graph_def -@test_util.run_all_in_graph_and_eager_modes +@test_util.run_deprecated_v1 # Removed in v2. 
class TestModelSavedModelExport(test.TestCase, parameterized.TestCase): def _save_model_dir(self, dirname='saved_model'): diff --git a/tensorflow/python/keras/saving/saving_utils.py b/tensorflow/python/keras/saving/saving_utils.py index fe8d26485b9..9a82f69a2fd 100644 --- a/tensorflow/python/keras/saving/saving_utils.py +++ b/tensorflow/python/keras/saving/saving_utils.py @@ -19,13 +19,14 @@ from __future__ import print_function import collections import os +import six from tensorflow.python.eager import def_function -from tensorflow.python.framework import tensor_spec from tensorflow.python.keras import backend as K from tensorflow.python.keras import losses from tensorflow.python.keras import optimizers from tensorflow.python.keras.engine import base_layer_utils +from tensorflow.python.keras.utils import generic_utils from tensorflow.python.keras.utils.io_utils import ask_to_proceed_with_overwrite from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import nest @@ -43,13 +44,12 @@ def extract_model_metrics(model): Dictionary mapping metric names to metric instances. May return `None` if the model does not contain any metrics. """ - if not getattr(model, '_compile_metrics', None): - return None - - # TODO(psv/kathywu): use this implementation in model to estimator flow. - # We are not using model.metrics here because we want to exclude the metrics - # added using `add_metric` API. - return {m.name: m for m in model._compile_metric_functions} # pylint: disable=protected-access + if getattr(model, '_compile_metrics', None): + # TODO(psv/kathywu): use this implementation in model to estimator flow. + # We are not using model.metrics here because we want to exclude the metrics + # added using `add_metric` API. + return {m.name: m for m in model._compile_metric_functions} # pylint: disable=protected-access + return None def model_input_signature(model, keep_original_batch_size=False): @@ -73,29 +73,9 @@ def model_input_signature(model, keep_original_batch_size=False): A list containing either a single TensorSpec or an object with nested TensorSpecs. This list does not contain the `training` argument. """ - try: - inputs = model.inputs - input_names = model.input_names - except AttributeError: + input_specs = model._get_save_spec(dynamic_batch=not keep_original_batch_size) # pylint: disable=protected-access + if input_specs is None: return None - flat_inputs = nest.flatten(inputs) - flat_input_names = nest.flatten(input_names) - flat_input_specs = [] - for input_tensor, input_name in zip(flat_inputs, flat_input_names): - if keep_original_batch_size: - input_shape = input_tensor.shape.as_list() - else: - # If the user has not explicitly provided the input_signature, we - # create it from the inputs. We make sure to set the first dimension - # (batch) to None here, as in serving or retraining, batch should not - # be fixed. See b/132783590 for context. - input_shape = [None] + input_tensor.shape[1:].as_list() - flat_input_specs.append(tensor_spec.TensorSpec( - shape=input_shape, dtype=input_tensor.dtype, - name=input_name)) - input_specs = nest.pack_sequence_as(structure=inputs, - flat_sequence=flat_input_specs) - # Return a list with a single element as the model's input signature. 
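The branch removed above shows what "dynamic batch" means for the exported input signature: unless the caller asks to keep the original batch size, the leading dimension is relaxed to `None` so serving or retraining is not pinned to the training batch size. In `TensorSpec` terms (names and sizes illustrative):

```python
import tensorflow as tf

fixed = tf.TensorSpec(shape=[32, 10], dtype=tf.float32, name='dense_input')
dynamic = tf.TensorSpec(shape=[None] + fixed.shape.as_list()[1:],
                        dtype=fixed.dtype, name=fixed.name)
print(dynamic)   # TensorSpec(shape=(None, 10), dtype=tf.float32, name='dense_input')
```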
if isinstance(input_specs, collections.Sequence) and len(input_specs) == 1: # Note that the isinstance check filters out single-element dictionaries, @@ -147,14 +127,15 @@ def trace_model_call(model, input_signature=None): with base_layer_utils.call_context().enter( model, inputs=inputs, build_graph=False, training=False, saving=True): - outputs_list = nest.flatten(model(inputs, training=False)) + outputs = model(inputs, training=False) - try: - output_names = model.output_names - except AttributeError: - from tensorflow.python.keras.engine import training_utils # pylint: disable=g-import-not-at-top - output_names = training_utils.generic_output_names(outputs_list) - return {name: output for name, output in zip(output_names, outputs_list)} + # Outputs always has to be a flat dict. + output_names = model.output_names # Functional Model. + if output_names is None: # Subclassed Model. + from tensorflow.python.keras.engine import compile_utils # pylint: disable=g-import-not-at-top + output_names = compile_utils.create_pseudo_output_names(outputs) + outputs = nest.flatten(outputs) + return {name: output for name, output in zip(output_names, outputs)} return _wrapped_model @@ -187,32 +168,22 @@ def model_metadata(model, include_optimizer=True, require_config=True): 'You will have to compile your model again after loading it. ' 'Prefer using a Keras optimizer instead ' '(see keras.io/optimizers).') - else: - try: - metadata['training_config'] = { - 'loss': model.loss, - # pylint: disable=protected-access - 'metrics': model._compile_metrics, - 'weighted_metrics': model._compile_weighted_metrics, - # pylint: enable=protected-access - 'sample_weight_mode': model.sample_weight_mode, - 'loss_weights': model.loss_weights, + elif model._compile_was_called: # pylint: disable=protected-access + training_config = model._get_compile_args() # pylint: disable=protected-access + training_config.pop('optimizer', None) # Handled separately. + metadata['training_config'] = _serialize_nested_config(training_config) + if isinstance(model.optimizer, optimizer_v2.RestoredOptimizer): + raise NotImplementedError( + 'As of now, Optimizers loaded from SavedModel cannot be saved. ' + 'If you\'re calling `model.save` or `tf.keras.models.save_model`,' + ' please set the `include_optimizer` option to `False`. For ' + '`tf.saved_model.save`, delete the optimizer from the model.') + else: + optimizer_config = { + 'class_name': model.optimizer.__class__.__name__, + 'config': model.optimizer.get_config() } - if isinstance(model.optimizer, optimizer_v2.RestoredOptimizer): - raise NotImplementedError( - 'As of now, Optimizers loaded from SavedModel cannot be saved. ' - 'If you\'re calling `model.save` or `tf.keras.models.save_model`,' - ' please set the `include_optimizer` option to `False`. For ' - '`tf.saved_model.save`, delete the optimizer from the model.') - else: - optimizer_config = { - 'class_name': model.optimizer.__class__.__name__, - 'config': model.optimizer.get_config()} - metadata['training_config']['optimizer_config'] = optimizer_config - except AttributeError: - pass # If the model has an optimizer, but not all of the attributes - # loss, _compile_metrics, etc., then it was not compiled using - # model.compile. In this case, do not save the training config. 
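The reworked `trace_model_call` above always returns a flat dict keyed by output name, falling back to pseudo names such as `'output_1'` for subclassed models that never set `output_names`. Loosely, the traced serving function behaves like this sketch:

```python
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(2, input_shape=(3,))])

@tf.function(input_signature=[tf.TensorSpec([None, 3], tf.float32)])
def serving_fn(inputs):
    outputs = model(inputs, training=False)
    names = getattr(model, 'output_names', None) or ['output_1']  # pseudo-name fallback
    return dict(zip(names, tf.nest.flatten(outputs)))

print(serving_fn(tf.ones((1, 3))))   # {'<output name>': <1x2 tensor>}
```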
+ metadata['training_config']['optimizer_config'] = optimizer_config return metadata @@ -224,73 +195,36 @@ def should_overwrite(filepath, overwrite): return True -def convert_output_metrics(metrics_config, custom_objects): - from tensorflow.python.keras import metrics as metrics_module # pylint:disable=g-import-not-at-top - if isinstance(metrics_config, list): - return [convert_output_metrics(mc, custom_objects) for mc in metrics_config] - elif (isinstance(metrics_config, dict) or - (metrics_config not in ['accuracy', 'acc', 'crossentropy', 'ce'])): - # Do not deserialize accuracy and cross-entropy strings as we have special - # case handling for these in compile, based on model output shape. - return metrics_module.deserialize(metrics_config, custom_objects) - return metrics_config - - def compile_args_from_training_config(training_config, custom_objects=None): """Return model.compile arguments from training config.""" if custom_objects is None: custom_objects = {} - optimizer_config = training_config['optimizer_config'] - optimizer = optimizers.deserialize( - optimizer_config, custom_objects=custom_objects) + with generic_utils.CustomObjectScope(custom_objects): + optimizer_config = training_config['optimizer_config'] + optimizer = optimizers.deserialize(optimizer_config) - # Recover losses. - loss_config = training_config['loss'] - if isinstance(loss_config, list): # Loss fed to compile as a list. - loss = [losses.deserialize(lc, custom_objects) for lc in loss_config] - elif isinstance(loss_config, dict) and 'class_name' not in loss_config: - # Loss fed to compile as a dict. - loss = { - k: losses.deserialize(v, custom_objects) - for (k, v) in loss_config.items() - } - else: # Loss fed to compile as a str/ function/ class instance. - loss = losses.deserialize(loss_config, custom_objects) + # Recover losses. + loss = None + loss_config = training_config.get('loss', None) + if loss_config is not None: + loss = _deserialize_nested_config(losses.deserialize, loss_config) - # Recover metrics. - metrics_config = training_config.get('metrics', None) - if isinstance(metrics_config, dict): # Metrics fed to compile as a dict. - metrics = { - k: convert_output_metrics(v, custom_objects) - for (k, v) in metrics_config.items() - } - elif isinstance(metrics_config, list): # Metrics fed to compile as a list. - metrics = [ - convert_output_metrics(m, custom_objects) for m in metrics_config - ] - else: # No metrics. + # Recover metrics. metrics = None + metrics_config = training_config.get('metrics', None) + if metrics_config is not None: + metrics = _deserialize_nested_config(_deserialize_metric, metrics_config) - # Recover weighted metrics. - weighted_metrics_config = training_config.get('weighted_metrics', None) - if isinstance(weighted_metrics_config, dict): - # Metrics fed to compile as a dict. - weighted_metrics = { - k: convert_output_metrics(v, custom_objects) - for (k, v) in weighted_metrics_config.items() - } - elif isinstance(weighted_metrics_config, list): - # Metrics fed to compile as a list. - weighted_metrics = [ - convert_output_metrics(m, custom_objects) - for m in weighted_metrics_config - ] - else: # No metrics. + # Recover weighted metrics. 
weighted_metrics = None + weighted_metrics_config = training_config.get('weighted_metrics', None) + if weighted_metrics_config is not None: + weighted_metrics = _deserialize_nested_config(_deserialize_metric, + weighted_metrics_config) - sample_weight_mode = training_config['sample_weight_mode'] - loss_weights = training_config['loss_weights'] + sample_weight_mode = training_config['sample_weight_mode'] + loss_weights = training_config['loss_weights'] return dict( optimizer=optimizer, @@ -299,3 +233,49 @@ def compile_args_from_training_config(training_config, custom_objects=None): weighted_metrics=weighted_metrics, loss_weights=loss_weights, sample_weight_mode=sample_weight_mode) + + +def _deserialize_nested_config(deserialize_fn, config): + """Deserializes arbitrary Keras `config` using `deserialize_fn`.""" + + def _is_single_object(obj): + if isinstance(obj, dict) and 'class_name' in obj: + return True # Serialized Keras object. + if isinstance(obj, six.string_types): + return True # Serialized function or string. + return False + + if config is None: + return None + if _is_single_object(config): + return deserialize_fn(config) + elif isinstance(config, dict): + return { + k: _deserialize_nested_config(deserialize_fn, v) + for k, v in config.items() + } + elif isinstance(config, (tuple, list)): + return [_deserialize_nested_config(deserialize_fn, obj) for obj in config] + + raise ValueError('Saved configuration not understood.') + + +def _serialize_nested_config(config): + """Serialized a nested structure of Keras objects.""" + + def _serialize_fn(obj): + if callable(obj): + return generic_utils.serialize_keras_object(obj) + return obj + + return nest.map_structure(_serialize_fn, config) + + +def _deserialize_metric(metric_config): + """Deserialize metrics, leaving special strings untouched.""" + from tensorflow.python.keras import metrics as metrics_module # pylint:disable=g-import-not-at-top + if metric_config in ['accuracy', 'acc', 'crossentropy', 'ce']: + # Do not deserialize accuracy and cross-entropy strings as we have special + # case handling for these in compile, based on model output shape. 
+ return metric_config + return metrics_module.deserialize(metric_config) diff --git a/tensorflow/python/keras/saving/saving_utils_test.py b/tensorflow/python/keras/saving/saving_utils_test.py index 92bee3df50a..4687e8a617a 100644 --- a/tensorflow/python/keras/saving/saving_utils_test.py +++ b/tensorflow/python/keras/saving/saving_utils_test.py @@ -76,7 +76,10 @@ class TraceModelCallTest(keras_parameterized.TestCase): fn = saving_utils.trace_model_call(model) signature_outputs = fn(inputs) - expected_outputs = {model.output_names[0]: model(inputs)} + if model.output_names: + expected_outputs = {model.output_names[0]: model(inputs)} + else: + expected_outputs = {'output_1': model(inputs)} self._assert_all_close(expected_outputs, signature_outputs) @@ -90,14 +93,19 @@ class TraceModelCallTest(keras_parameterized.TestCase): loss='mse', run_eagerly=testing_utils.should_run_eagerly(), experimental_run_tf_function=testing_utils.should_run_tf_function()) - model.fit(x=np.random.random((8, 5)), - y=np.random.random((8, 3)), epochs=2) + model.fit( + x=np.random.random((8, 5)).astype(np.float32), + y=np.random.random((8, 3)).astype(np.float32), + epochs=2) inputs = array_ops.ones((8, 5)) fn = saving_utils.trace_model_call(model) signature_outputs = fn(inputs) - expected_outputs = {model.output_names[0]: model(inputs)} + if model.output_names: + expected_outputs = {model.output_names[0]: model(inputs)} + else: + expected_outputs = {'output_1': model(inputs)} self._assert_all_close(expected_outputs, signature_outputs) @@ -140,9 +148,13 @@ class TraceModelCallTest(keras_parameterized.TestCase): fn = saving_utils.trace_model_call(model) signature_outputs = fn([input_a_np, input_b_np]) outputs = model([input_a_np, input_b_np]) - expected_outputs = {model.output_names[0]: outputs[0], - model.output_names[1]: outputs[1]} - + if model.output_names: + expected_outputs = { + model.output_names[0]: outputs[0], + model.output_names[1]: outputs[1] + } + else: + expected_outputs = {'output_1': outputs[0], 'output_2': outputs[1]} self._assert_all_close(expected_outputs, signature_outputs) @test_util.run_in_graph_and_eager_modes @@ -177,7 +189,10 @@ class TraceModelCallTest(keras_parameterized.TestCase): fn = saving_utils.trace_model_call( model, [tensor_spec.TensorSpec(shape=[None, 5], dtype=dtypes.float32)]) signature_outputs = fn(inputs) - expected_outputs = {model.output_names[0]: model(inputs)} + if model.output_names: + expected_outputs = {model.output_names[0]: model(inputs)} + else: + expected_outputs = {'output_1': model(inputs)} self._assert_all_close(expected_outputs, signature_outputs) @test_util.run_in_graph_and_eager_modes @@ -242,7 +257,9 @@ def _import_and_infer(save_dir, inputs): model = loader.load(session, [tag_constants.SERVING], save_dir) signature = model.signature_def[ signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] - assert set(inputs.keys()) == set(signature.inputs.keys()) + assert set(inputs.keys()) == set( + signature.inputs.keys()), ('expected {}, found {}'.format( + signature.inputs.keys(), inputs.keys())) feed_dict = {} for arg_name in inputs.keys(): feed_dict[graph.get_tensor_by_name(signature.inputs[arg_name].name)] = ( @@ -254,10 +271,10 @@ def _import_and_infer(save_dir, inputs): return session.run(output_dict, feed_dict=feed_dict) +@keras_parameterized.run_with_all_model_types +@keras_parameterized.run_all_keras_modes(always_skip_v1=True) class ModelSaveTest(keras_parameterized.TestCase): - @keras_parameterized.run_with_all_model_types - @test_util.run_v2_only def 
test_model_save(self): input_dim = 5 model = testing_utils.get_small_mlp(10, 3, input_dim) @@ -269,14 +286,21 @@ class ModelSaveTest(keras_parameterized.TestCase): save_dir = os.path.join(self.get_temp_dir(), 'saved_model') save_lib.save(model, save_dir) - self.assertAllClose( - {model.output_names[0]: model.predict_on_batch(inputs)}, - _import_and_infer(save_dir, {model.input_names[0]: np.ones((8, 5))})) + if model.output_names: + output_name = model.output_names[0] + input_name = model.input_names[0] + else: + output_name = 'output_1' + input_name = 'input_1' + + self.assertAllClose({output_name: model.predict_on_batch(inputs)}, + _import_and_infer(save_dir, + {input_name: np.ones((8, 5))})) +@test_util.run_deprecated_v1 # Not used in v2. class ExtractModelMetricsTest(keras_parameterized.TestCase): - @keras_parameterized.run_all_keras_modes def test_extract_model_metrics(self): a = keras.layers.Input(shape=(3,), name='input_a') b = keras.layers.Input(shape=(3,), name='input_b') @@ -308,9 +332,7 @@ class ExtractModelMetricsTest(keras_parameterized.TestCase): keras.metrics.BinaryAccuracy(), 'mae', keras.metrics.mean_squared_error ], - optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.01), - run_eagerly=testing_utils.should_run_eagerly(), - experimental_run_tf_function=testing_utils.should_run_tf_function()) + optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.01)) extract_metrics = saving_utils.extract_model_metrics(model) self.assertEqual(set(model_metric_names), set(model.metrics_names)) self.assertEqual(set(extract_metric_names), set(extract_metrics.keys())) diff --git a/tensorflow/python/keras/testing_utils.py b/tensorflow/python/keras/testing_utils.py index a3867927e70..564c1d07fe2 100644 --- a/tensorflow/python/keras/testing_utils.py +++ b/tensorflow/python/keras/testing_utils.py @@ -632,6 +632,9 @@ class _MultiIOSubclassModel(keras.Model): inputs = layer(inputs) a = inputs b = inputs + elif isinstance(inputs, dict): + a = inputs['input_1'] + b = inputs['input_2'] else: a, b = inputs diff --git a/tensorflow/python/keras/tests/model_subclassing_compiled_test.py b/tensorflow/python/keras/tests/model_subclassing_compiled_test.py index 404c9f0c975..aa94f8400e0 100644 --- a/tensorflow/python/keras/tests/model_subclassing_compiled_test.py +++ b/tensorflow/python/keras/tests/model_subclassing_compiled_test.py @@ -134,8 +134,6 @@ class ModelSubclassCompiledTest(keras_parameterized.TestCase): self.assertEqual(len(model.weights), 10) self.assertEqual(len(model.trainable_weights), 8) self.assertEqual(len(model.non_trainable_weights), 2) - self.assertEqual(len(model.inputs), 2) - self.assertEqual(len(model.outputs), 2) def test_updates(self): # test that updates get run during training diff --git a/tensorflow/python/keras/tests/model_subclassing_test.py b/tensorflow/python/keras/tests/model_subclassing_test.py index 56cdbb17d27..d3b601e75ed 100644 --- a/tensorflow/python/keras/tests/model_subclassing_test.py +++ b/tensorflow/python/keras/tests/model_subclassing_test.py @@ -340,7 +340,7 @@ class ModelSubclassingTest(keras_parameterized.TestCase): # Single-io model = testing_utils.SmallSubclassMLP( num_hidden=32, num_classes=4, use_bn=True, use_dp=True) - model._set_inputs(np.ones((3, 4))) # need to build model first + model(np.ones((3, 4))) # need to build model first print_fn = ToString() model.summary(print_fn=print_fn) self.assertTrue('Trainable params: 356' in print_fn.contents) @@ -348,8 +348,7 @@ class ModelSubclassingTest(keras_parameterized.TestCase): # Multi-io model = 
model_util.get_multi_io_subclass_model( num_classes=(5, 6), use_bn=True, use_dp=True) - model._set_inputs([np.ones((3, 4)), - np.ones((3, 4))]) # need to build model first + model([np.ones((3, 4)), np.ones((3, 4))]) # need to build model first print_fn = ToString() model.summary(print_fn=print_fn) self.assertTrue('Trainable params: 587' in print_fn.contents) @@ -677,6 +676,8 @@ class CustomCallSignatureTests(test.TestCase): @test_util.assert_no_new_tensors @test_util.assert_no_garbage_created def test_training_no_default(self): + if not context.executing_eagerly(): + return model = model_util.TrainingNoDefaultModel() arg = array_ops.ones([1, 1]) model(arg, True) diff --git a/tensorflow/python/keras/tests/temporal_sample_weights_correctness_test.py b/tensorflow/python/keras/tests/temporal_sample_weights_correctness_test.py index 0d9f77cb000..8854783ea05 100644 --- a/tensorflow/python/keras/tests/temporal_sample_weights_correctness_test.py +++ b/tensorflow/python/keras/tests/temporal_sample_weights_correctness_test.py @@ -20,13 +20,13 @@ from __future__ import print_function import numpy as np -from tensorflow.python import tf2 from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import layers from tensorflow.python.keras import metrics from tensorflow.python.keras import optimizer_v2 from tensorflow.python.keras import testing_utils from tensorflow.python.platform import test +from tensorflow.python.util import nest class Bias(layers.Layer): @@ -102,7 +102,7 @@ def run_with_different_sample_weight_mode_inputs(fn, partial_sw=True): @keras_parameterized.run_with_all_model_types(exclude_models=['sequential']) -@keras_parameterized.run_all_keras_modes +@keras_parameterized.run_all_keras_modes(always_skip_v1=True) class TestMetricsCorrectnessMultiIOTemporal(keras_parameterized.TestCase): def custom_generator_multi_io_temporal(self, sample_weights=None): @@ -116,13 +116,6 @@ class TestMetricsCorrectnessMultiIOTemporal(keras_parameterized.TestCase): """ batch_size = 3 num_samples = 3 - if sample_weights: - assert len(sample_weights) == 2 - w1 = sample_weights[0] - w2 = sample_weights[1] - else: - w1 = None - w2 = None iteration = 0 while True: batch_index = iteration * batch_size % num_samples @@ -132,13 +125,10 @@ class TestMetricsCorrectnessMultiIOTemporal(keras_parameterized.TestCase): x = [self.x[start:end], self.x[start:end]] y = [self.y1[start:end], self.y2[start:end]] if sample_weights: - w = [ - None if w1 is None else w1[start:end], - None if w2 is None else w2[start:end] - ] + sw = nest.map_structure(lambda w: w[start:end], sample_weights) else: - w = None - yield x, y, w + sw = None + yield x, y, sw def setUp(self): super(TestMetricsCorrectnessMultiIOTemporal, self).setUp() @@ -147,11 +137,6 @@ class TestMetricsCorrectnessMultiIOTemporal(keras_parameterized.TestCase): self.y1 = np.asarray([[[.5], [1.]], [[2.], [2.5]], [[3.5], [2.5]]]) self.y2 = np.asarray([[[.5], [1.5]], [[2.], [1.5]], [[3.5], [3.]]]) - if tf2.enabled(): - self.wmae = 'mae_2' - else: - self.wmae = 'weighted_mae_2' - # Without weights: # Epoch 1 - bias = 0 # y_pred_1 = [[[0.], [0.]], [[1.], [1.]], [[2.], [2.]]] @@ -172,8 +157,8 @@ class TestMetricsCorrectnessMultiIOTemporal(keras_parameterized.TestCase): self.expected_fit_result = { 'output_1_mae': [1, 0.9], 'output_2_mae': [1, 0.9], - 'output_1_' + self.wmae: [1, 0.9], - 'output_2_' + self.wmae: [1, 0.9], + 'output_1_mae_2': [1, 0.9], + 'output_2_mae_2': [1, 0.9], 'loss': [2., 1.8], 'output_1_loss': [1, 0.9], 'output_2_loss': [1, 0.9], @@ 
-229,8 +214,8 @@ class TestMetricsCorrectnessMultiIOTemporal(keras_parameterized.TestCase): self.expected_fit_result_with_weights = { 'output_1_mae': [1, 0.875], 'output_2_mae': [1, 0.875], - 'output_1_' + self.wmae: [1, 0.875], - 'output_2_' + self.wmae: [1, 0.875], + 'output_1_mae_2': [1, 0.875], + 'output_2_mae_2': [1, 0.875], 'loss': [2.5, 2.1875], 'output_1_loss': [1.25, 1.09375], 'output_2_loss': [1.25, 1.09375], @@ -239,8 +224,8 @@ class TestMetricsCorrectnessMultiIOTemporal(keras_parameterized.TestCase): self.expected_fit_result_with_weights_output_2 = { 'output_1_mae': [1., 0.9], 'output_2_mae': [1, 0.875], - 'output_1_' + self.wmae: [1., 0.9], - 'output_2_' + self.wmae: [1., 0.875], + 'output_1_mae_2': [1., 0.9], + 'output_2_mae_2': [1., 0.875], 'loss': [2.25, 1.99375], 'output_1_loss': [1., 0.9], 'output_2_loss': [1.25, 1.09375], @@ -461,7 +446,7 @@ class TestMetricsCorrectnessMultiIOTemporal(keras_parameterized.TestCase): def _train_and_assert(model): history = model.fit_generator( self.custom_generator_multi_io_temporal( - sample_weights=[None, self.sample_weight_2]), + sample_weights={'output_2': self.sample_weight_2}), steps_per_epoch=1, epochs=2) for key, value in self.expected_fit_result_with_weights_output_2.items(): @@ -506,7 +491,7 @@ class TestMetricsCorrectnessMultiIOTemporal(keras_parameterized.TestCase): }) eval_result = model.evaluate_generator( self.custom_generator_multi_io_temporal( - sample_weights=[None, self.sample_weight_2]), + sample_weights={'output_2': self.sample_weight_2}), steps=2) self.assertAllClose(eval_result, self.expected_batch_result_with_weights_output_2, @@ -517,9 +502,7 @@ class TestMetricsCorrectnessMultiIOTemporal(keras_parameterized.TestCase): def test_error_on_fit_with_class_weight(self): def _train_and_assert(model): - with self.assertRaisesRegex( - ValueError, - r'`class_weight` not supported for 3\+ dimensional targets.'): + with self.assertRaises(ValueError): model.fit([self.x, self.x], [self.y1, self.y2], class_weight={'output_1': { .5: .5, diff --git a/tensorflow/python/keras/utils/composite_tensor_support_test.py b/tensorflow/python/keras/utils/composite_tensor_support_test.py index 87e70a239ce..13af9590e80 100644 --- a/tensorflow/python/keras/utils/composite_tensor_support_test.py +++ b/tensorflow/python/keras/utils/composite_tensor_support_test.py @@ -44,6 +44,7 @@ from tensorflow.python.ops import sparse_ops from tensorflow.python.ops.ragged import ragged_factory_ops from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.platform import test +from tensorflow.python.util import nest # Define test-only Layer classes to validate passing Sparse and Ragged tensors @@ -57,6 +58,10 @@ class ToDense(Layer): self._supports_ragged_inputs = True def call(self, inputs): + if isinstance(inputs, dict): # Dicts are no longer flattened. + # Always a single element in these tests. + inputs = nest.flatten(inputs)[0] + if isinstance(inputs, ragged_tensor.RaggedTensor): output = inputs.to_tensor(default_value=self._default_value) elif isinstance(inputs, sparse_tensor.SparseTensor): @@ -610,80 +615,6 @@ class RaggedTensorInputValidationTest(keras_parameterized.TestCase, result = model.predict(input_data, **kwargs) self.assertAllEqual(expected_output, result) - def test_ragged_tensor_input_with_wrong_ragged_rank_fails( - self, use_dict, use_dataset): - # Define some input data that will NOT match the input shape spec. - data = [(ragged_factory_ops.constant([[[1, 0]], [[2, 3]]]), None)] - - # Prepare the model to test. 
- input_shape = (None, 2) # RaggedTensorInputTest uses (None, None). - input_name = get_input_name(use_dict) - model_input = input_layer.Input( - shape=input_shape, ragged=True, name=input_name, dtype=dtypes.int32) - layers = [ToDense(default_value=-1)] - model = get_model_from_layers_with_input(layers, model_input=model_input) - model.compile( - optimizer="sgd", - loss="mse", - metrics=["accuracy"], - **get_test_mode_kwargs()) - - # Define some input data with the wrong ragged rank - for data_element in data: - input_data, _ = prepare_inputs( - data_element, - use_dict, - use_dataset, - action="predict", - input_name=input_name) - with self.assertRaisesRegex(ValueError, ".*don't have the same nested.*"): - _ = model.predict(input_data) - - -# CompositeTensor shape validation only happens in non-eager modes and in non- -# subclassed models, so we run a separate parameterized test for them. -@keras_parameterized.run_with_all_model_types(exclude_models=["subclass"]) -@keras_parameterized.run_all_keras_modes(always_skip_eager=True) -class SparseTensorInputValidationTest(keras_parameterized.TestCase): - - def test_sparse_scipy_input_checks_shape(self): - model_input = input_layer.Input(shape=(3,), sparse=True, dtype=dtypes.int32) - layers = [ToDense(default_value=-1)] - model = get_model_from_layers_with_input(layers, model_input=model_input) - - input_data = scipy.sparse.coo_matrix(([1, 2, 3], ([0, 1, 1], [0, 0, 1])), - shape=[2, 4]) - with self.assertRaisesRegex(ValueError, ".*got array with shape.*"): - _ = model.predict(input_data) - - def test_sparse_tensor_input_checks_shapes(self): - # Create a model that accepts a sparse input and converts the sparse tensor - # back to a dense tensor. - model_input = input_layer.Input( - shape=(2, None), sparse=True, dtype=dtypes.int32) - layers = [ToDense(default_value=-1)] - model = get_model_from_layers_with_input(layers, model_input=model_input) - - # Define some input data. - input_data = sparse_tensor.SparseTensor([[0, 0, 0], [1, 0, 0], [1, 0, 1]], - [1, 2, 3], [2, 1, 3]) - kwargs = get_kwargs(use_dataset=False) - with self.assertRaisesRegex(ValueError, ".*got array with shape.*"): - _ = model.predict(input_data, **kwargs) - - def test_ragged_tensor_input_with_wrong_value_shape(self): - # Create a model that accepts a ragged input and converts it to dense. 
- model_input = input_layer.Input( - shape=(None, 4), ragged=True, dtype=dtypes.int32) - layers = [ToDense(default_value=-1)] - model = get_model_from_layers_with_input(layers, model_input=model_input) - - # Define some input data with the wrong ragged rank - input_data = ragged_factory_ops.constant([[[1, 0]], [[2, 3]]], - ragged_rank=1) - with self.assertRaisesRegex(ValueError, ".*got array with shape.*"): - _ = model.predict(input_data) - @keras_parameterized.run_with_all_model_types() @keras_parameterized.run_all_keras_modes(always_skip_v1=True) @@ -707,7 +638,7 @@ class CompositeTensorModelPredictTest(keras_parameterized.TestCase): sparse_input = sparse_tensor.SparseTensor( # A two-row matrix indices=[(0, 0), (0, 1), (0, 2), (5, 0), (5, 1), (5, 2)], - values=[1, 1, 1, 1, 1, 1], + values=[1., 1., 1., 1., 1., 1.], dense_shape=(6, 3)) shape = model(sparse_input).shape @@ -736,37 +667,5 @@ class CompositeTensorModelPredictTest(keras_parameterized.TestCase): self.assertEqual((2, None, 5), self._normalize_shape(shape)) -@keras_parameterized.run_with_all_model_types( - exclude_models=["functional"]) -@keras_parameterized.run_all_keras_modes -class UndefinedCompositeTensorInputsTest(keras_parameterized.TestCase): - - def test_subclass_implicit_sparse_inputs_fails(self): - # Create a model that accepts a sparse input and converts the sparse tensor - # back to a dense tensor. - layers = [ToDense(default_value=-1)] - model = testing_utils.get_model_from_layers(layers) - - # Define some input data. - input_data = sparse_tensor.SparseTensor([[0, 0], [1, 0], [1, 1]], [1, 2, 3], - [2, 3]) - kwargs = get_kwargs(False) - with self.assertRaisesRegex( - ValueError, ".*All SparseTensor and RaggedTensor inputs .*"): - _ = model.predict(input_data, **kwargs) - - def test_subclass_implicit_sparse_scipy_inputs_fails(self): - # Create a model that accepts a sparse input and converts the sparse tensor - # back to a dense tensor. - layers = [ToDense(default_value=-1)] - model = testing_utils.get_model_from_layers(layers) - - # Define some input data. - input_data = scipy.sparse.coo_matrix(([1, 2, 3], ([0, 1, 1], [0, 0, 1])), - shape=[2, 3]) - with self.assertRaisesRegex(ValueError, ".*either a single array.*"): - _ = model.predict(input_data) - - if __name__ == "__main__": test.main() diff --git a/tensorflow/python/keras/utils/generic_utils.py b/tensorflow/python/keras/utils/generic_utils.py index 801f5ad99bc..edbfed6d776 100644 --- a/tensorflow/python/keras/utils/generic_utils.py +++ b/tensorflow/python/keras/utils/generic_utils.py @@ -539,7 +539,7 @@ class Progbar(object): self._start = time.time() self._last_update = 0 - def update(self, current, values=None): + def update(self, current, values=None, finalize=None): """Updates the progress bar. Arguments: @@ -547,7 +547,15 @@ class Progbar(object): values: List of tuples: `(name, value_for_last_step)`. If `name` is in `stateful_metrics`, `value_for_last_step` will be displayed as-is. Else, an average of the metric over time will be displayed. + finalize: Whether this is the last update for the progress bar. If + `None`, defaults to `current >= self.target`. 
""" + if finalize is None: + if self.target is None: + finalize = False + else: + finalize = current >= self.target + values = values or [] for k, v in values: if k not in self._values_order: @@ -573,8 +581,7 @@ class Progbar(object): now = time.time() info = ' - %.0fs' % (now - self._start) if self.verbose == 1: - if (now - self._last_update < self.interval and - self.target is not None and current < self.target): + if now - self._last_update < self.interval and not finalize: return prev_total_width = self._total_width @@ -607,7 +614,15 @@ class Progbar(object): time_per_unit = (now - self._start) / current else: time_per_unit = 0 - if self.target is not None and current < self.target: + + if self.target is None or finalize: + if time_per_unit >= 1 or time_per_unit == 0: + info += ' %.0fs/%s' % (time_per_unit, self.unit_name) + elif time_per_unit >= 1e-3: + info += ' %.0fms/%s' % (time_per_unit * 1e3, self.unit_name) + else: + info += ' %.0fus/%s' % (time_per_unit * 1e6, self.unit_name) + else: eta = time_per_unit * (self.target - current) if eta > 3600: eta_format = '%d:%02d:%02d' % (eta // 3600, @@ -618,13 +633,6 @@ class Progbar(object): eta_format = '%ds' % eta info = ' - ETA: %s' % eta_format - else: - if time_per_unit >= 1 or time_per_unit == 0: - info += ' %.0fs/%s' % (time_per_unit, self.unit_name) - elif time_per_unit >= 1e-3: - info += ' %.0fms/%s' % (time_per_unit * 1e3, self.unit_name) - else: - info += ' %.0fus/%s' % (time_per_unit * 1e6, self.unit_name) for k in self._values_order: info += ' - %s:' % k @@ -641,14 +649,14 @@ class Progbar(object): if prev_total_width > self._total_width: info += (' ' * (prev_total_width - self._total_width)) - if self.target is not None and current >= self.target: + if finalize: info += '\n' sys.stdout.write(info) sys.stdout.flush() elif self.verbose == 2: - if self.target is not None and current >= self.target: + if finalize: numdigits = int(np.log10(self.target)) + 1 count = ('%' + str(numdigits) + 'd/%d') % (current, self.target) info = count + info diff --git a/tensorflow/python/keras/utils/layer_utils.py b/tensorflow/python/keras/utils/layer_utils.py index dcb42abf687..1dfd2f517c6 100644 --- a/tensorflow/python/keras/utils/layer_utils.py +++ b/tensorflow/python/keras/utils/layer_utils.py @@ -258,7 +258,6 @@ def print_summary(model, line_length=None, positions=None, print_fn=None): else: print_fn('_' * line_length) - model._check_trainable_weights_consistency() if hasattr(model, '_collected_trainable_weights'): trainable_count = count_params(model._collected_trainable_weights) else: diff --git a/tensorflow/python/keras/utils/tf_utils.py b/tensorflow/python/keras/utils/tf_utils.py index 1a85b838be6..57b5c605db9 100644 --- a/tensorflow/python/keras/utils/tf_utils.py +++ b/tensorflow/python/keras/utils/tf_utils.py @@ -17,6 +17,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import copy import six from tensorflow.python.data.experimental.ops import cardinality @@ -464,3 +465,27 @@ def dataset_is_infinite(dataset): else: dataset_size = K.get_session().run(cardinality.cardinality(dataset)) return dataset_size == cardinality.INFINITE + + +def get_tensor_spec(t, dynamic_batch=False, name=None): + """Returns a `TensorSpec` given a single `Tensor` or `TensorSpec`.""" + if isinstance(t, type_spec.TypeSpec): + spec = t + elif isinstance(t, composite_tensor.CompositeTensor): + # TODO(b/148821952): Should these specs have a name attr? 
+ spec = t._type_spec # pylint: disable=protected-access + elif hasattr(t, 'shape') and hasattr(t, 'dtype'): + spec = tensor_spec.TensorSpec(shape=t.shape, dtype=t.dtype, name=name) + else: + return None # Allow non-Tensors to pass through. + + if not dynamic_batch: + return spec + + dynamic_batch_spec = copy.deepcopy(spec) + # RaggedTensorSpec only has a private _shape. + shape = dynamic_batch_spec._shape.as_list() # pylint: disable=protected-access + if shape: + shape[0] = None + dynamic_batch_spec._shape = tensor_shape.TensorShape(shape) # pylint: disable=protected-access + return dynamic_batch_spec diff --git a/tensorflow/python/keras/utils/tf_utils_test.py b/tensorflow/python/keras/utils/tf_utils_test.py index 392ab7d59a5..2f87af2ef06 100644 --- a/tensorflow/python/keras/utils/tf_utils_test.py +++ b/tensorflow/python/keras/utils/tf_utils_test.py @@ -79,6 +79,8 @@ class TestIsSymbolicTensor(test.TestCase): self.assertTrue(tf_utils.is_symbolic_tensor(CustomClass())) def test_enables_nontensor_plumbing(self): + if context.executing_eagerly(): + self.skipTest('`compile` functionality changed.') # Setup. class Foo(object): diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py index 28741d82bbc..33abd5c664e 100644 --- a/tensorflow/python/layers/base.py +++ b/tensorflow/python/layers/base.py @@ -552,7 +552,7 @@ class Layer(base_layer.Layer): return outputs def __deepcopy__(self, memo): - no_copy = set(['_graph', '_thread_local']) + no_copy = set(['_graph', '_thread_local', '_metrics_lock']) shallow_copy = set(['_scope', '_always_reuse_variable_scope']) cls = self.__class__ result = cls.__new__(cls) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt index a823b172ace..440e6c8a5c4 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt @@ -97,10 +97,6 @@ tf_class { name: "run_eagerly" mtype: "" } - member { - name: "sample_weights" - mtype: "" - } member { name: "state_updates" mtype: "" @@ -195,7 +191,7 @@ tf_class { } member_method { name: "evaluate" - argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'False\'], " } member_method { name: "evaluate_generator" @@ -203,7 +199,7 @@ tf_class { } member_method { name: "fit" - argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', 
\'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_batch_size\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], " } member_method { name: "fit_generator" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt index 77b0239181b..eee65bc6db4 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt @@ -98,10 +98,6 @@ tf_class { name: "run_eagerly" mtype: "" } - member { - name: "sample_weights" - mtype: "" - } member { name: "state_updates" mtype: "" @@ -200,7 +196,7 @@ tf_class { } member_method { name: "evaluate" - argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'False\'], " } member_method { name: "evaluate_generator" @@ -208,7 +204,7 @@ tf_class { } member_method { name: "fit" - argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_batch_size\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], " } member_method { name: "fit_generator" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt index 4a6a96e3952..c64a1881f88 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt @@ -98,10 +98,6 @@ tf_class { name: "run_eagerly" mtype: "" } - member { - name: "sample_weights" - mtype: "" - } member { name: "state_updates" mtype: "" @@ 
-196,7 +192,7 @@ tf_class { } member_method { name: "evaluate" - argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'False\'], " } member_method { name: "evaluate_generator" @@ -204,7 +200,7 @@ tf_class { } member_method { name: "fit" - argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_batch_size\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], " } member_method { name: "fit_generator" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt index 4c44837ef5f..238701103f7 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt @@ -98,10 +98,6 @@ tf_class { name: "run_eagerly" mtype: "" } - member { - name: "sample_weights" - mtype: "" - } member { name: "state_updates" mtype: "" @@ -196,7 +192,7 @@ tf_class { } member_method { name: "evaluate" - argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'False\'], " } member_method { name: "evaluate_generator" @@ -204,7 +200,7 @@ tf_class { } member_method { name: "fit" - argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', 
\'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_batch_size\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], " } member_method { name: "fit_generator" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt index c63d5ff76b3..788efce0063 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt @@ -97,10 +97,6 @@ tf_class { name: "run_eagerly" mtype: "" } - member { - name: "sample_weights" - mtype: "" - } member { name: "state_updates" mtype: "" @@ -195,7 +191,7 @@ tf_class { } member_method { name: "evaluate" - argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'False\'], " } member_method { name: "evaluate_generator" @@ -203,7 +199,7 @@ tf_class { } member_method { name: "fit" - argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_batch_size\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], " } member_method { name: "fit_generator" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt 
b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt index 6ca4124190d..6166b16f964 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt @@ -98,10 +98,6 @@ tf_class { name: "run_eagerly" mtype: "" } - member { - name: "sample_weights" - mtype: "" - } member { name: "state_updates" mtype: "" @@ -200,7 +196,7 @@ tf_class { } member_method { name: "evaluate" - argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'False\'], " } member_method { name: "evaluate_generator" @@ -208,7 +204,7 @@ tf_class { } member_method { name: "fit" - argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_batch_size\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], " } member_method { name: "fit_generator" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-progbar.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-progbar.pbtxt index 8177cc71ed3..d7882583515 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-progbar.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-progbar.pbtxt @@ -12,6 +12,6 @@ tf_class { } member_method { name: "update" - argspec: "args=[\'self\', \'current\', \'values\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'current\', \'values\', \'finalize\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } } diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt index a823b172ace..440e6c8a5c4 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt @@ -97,10 +97,6 @@ tf_class { name: "run_eagerly" mtype: "" } - member { - name: "sample_weights" - mtype: "" - } member { name: "state_updates" mtype: "" @@ -195,7 +191,7 @@ tf_class { } 
member_method { name: "evaluate" - argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'False\'], " } member_method { name: "evaluate_generator" @@ -203,7 +199,7 @@ tf_class { } member_method { name: "fit" - argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_batch_size\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], " } member_method { name: "fit_generator" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt index 77b0239181b..eee65bc6db4 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt @@ -98,10 +98,6 @@ tf_class { name: "run_eagerly" mtype: "" } - member { - name: "sample_weights" - mtype: "" - } member { name: "state_updates" mtype: "" @@ -200,7 +196,7 @@ tf_class { } member_method { name: "evaluate" - argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'False\'], " } member_method { name: "evaluate_generator" @@ -208,7 +204,7 @@ tf_class { } member_method { name: "fit" - argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', 
\'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_batch_size\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], " } member_method { name: "fit_generator" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt index 4a6a96e3952..c64a1881f88 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt @@ -98,10 +98,6 @@ tf_class { name: "run_eagerly" mtype: "" } - member { - name: "sample_weights" - mtype: "" - } member { name: "state_updates" mtype: "" @@ -196,7 +192,7 @@ tf_class { } member_method { name: "evaluate" - argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'False\'], " } member_method { name: "evaluate_generator" @@ -204,7 +200,7 @@ tf_class { } member_method { name: "fit" - argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_batch_size\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], " } member_method { name: "fit_generator" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt 
index 4c44837ef5f..238701103f7 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt @@ -98,10 +98,6 @@ tf_class { name: "run_eagerly" mtype: "" } - member { - name: "sample_weights" - mtype: "" - } member { name: "state_updates" mtype: "" @@ -196,7 +192,7 @@ tf_class { } member_method { name: "evaluate" - argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'False\'], " } member_method { name: "evaluate_generator" @@ -204,7 +200,7 @@ tf_class { } member_method { name: "fit" - argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_batch_size\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], " } member_method { name: "fit_generator" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt index c63d5ff76b3..788efce0063 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt @@ -97,10 +97,6 @@ tf_class { name: "run_eagerly" mtype: "" } - member { - name: "sample_weights" - mtype: "" - } member { name: "state_updates" mtype: "" @@ -195,7 +191,7 @@ tf_class { } member_method { name: "evaluate" - argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'False\'], 
" } member_method { name: "evaluate_generator" @@ -203,7 +199,7 @@ tf_class { } member_method { name: "fit" - argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_batch_size\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], " } member_method { name: "fit_generator" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt index 6ca4124190d..6166b16f964 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt @@ -98,10 +98,6 @@ tf_class { name: "run_eagerly" mtype: "" } - member { - name: "sample_weights" - mtype: "" - } member { name: "state_updates" mtype: "" @@ -200,7 +196,7 @@ tf_class { } member_method { name: "evaluate" - argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'return_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'False\'], " } member_method { name: "evaluate_generator" @@ -208,7 +204,7 @@ tf_class { } member_method { name: "fit" - argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], " + argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_batch_size\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', 
\'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], " } member_method { name: "fit_generator" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-progbar.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-progbar.pbtxt index 8177cc71ed3..d7882583515 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-progbar.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-progbar.pbtxt @@ -12,6 +12,6 @@ tf_class { } member_method { name: "update" - argspec: "args=[\'self\', \'current\', \'values\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'current\', \'values\', \'finalize\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } } From 3aaf472deadce51914d60578b6b16d1e464707e6 Mon Sep 17 00:00:00 2001 From: Robert David Date: Wed, 19 Feb 2020 17:04:38 -0800 Subject: [PATCH 303/442] SVDF: Do the activation-state-shifting at the beginning of each Eval, sparing an extra reinitialization of the latest activation. Note this does change behavior: the state tensor will be shifted in time - or more precisely, it will not be shifted after executing a single step. PiperOrigin-RevId: 296091378 Change-Id: I24cd2bf0ece80d524a31271db61f6bcbdc40b5b9 --- .../lite/kernels/internal/reference/svdf.h | 62 +++++++-------- tensorflow/lite/micro/kernels/svdf.cc | 75 ++++++++----------- .../micro/kernels/xtensa-hifimini/svdf.cc | 39 +++++----- 3 files changed, 75 insertions(+), 101 deletions(-) diff --git a/tensorflow/lite/kernels/internal/reference/svdf.h b/tensorflow/lite/kernels/internal/reference/svdf.h index 7016e3ab053..66b874447c6 100644 --- a/tensorflow/lite/kernels/internal/reference/svdf.h +++ b/tensorflow/lite/kernels/internal/reference/svdf.h @@ -73,18 +73,6 @@ static inline void ApplyTimeWeightsBiasAndActivation( tensor_utils::ApplyActivationToVector(output_ptr_batch, num_units, activation, output_ptr_batch); } - - // Left shift the activation_state to make room for next cycle's activation. - // TODO(alanchiao): explore collapsing this into a single loop. - for (int b = 0; b < batch_size; ++b) { - float* state_ptr_batch = - GetTensorData(activation_state) + b * memory_size * num_filters; - for (int f = 0; f < num_filters; ++f) { - tensor_utils::VectorShiftLeft(state_ptr_batch, memory_size, - /*shift_value=*/0.0f); - state_ptr_batch += memory_size; - } - } } inline void EvalIntegerSVDF( @@ -102,6 +90,19 @@ inline void EvalIntegerSVDF( const int n_unit = n_filter / n_rank; const int n_memory = weights_time_tensor->dims->data[1]; + // Shift state. + { + int16_t zero = 0; + for (int b = 0; b < n_batch; ++b) { + int16_t* state_ptr_batch = + GetTensorData(state_tensor) + b * n_memory * n_filter; + for (int f = 0; f < n_filter; ++f) { + tensor_utils::VectorShiftLeft(state_ptr_batch, n_memory, zero); + state_ptr_batch += n_memory; + } + } + } + // Feature matmul. { int16_t* state = GetTensorData(state_tensor); @@ -176,19 +177,6 @@ inline void EvalIntegerSVDF( GetTensorData(output_tensor)[i] = static_cast(x4); } } - - // Shift state. 
- { - int16_t zero = 0; - for (int b = 0; b < n_batch; ++b) { - int16_t* state_ptr_batch = - GetTensorData(state_tensor) + b * n_memory * n_filter; - for (int f = 0; f < n_filter; ++f) { - tensor_utils::VectorShiftLeft(state_ptr_batch, n_memory, zero); - state_ptr_batch += n_memory; - } - } - } } inline void EvalFloatSVDF(TfLiteContext* context, TfLiteNode* node, @@ -205,15 +193,15 @@ inline void EvalFloatSVDF(TfLiteContext* context, TfLiteNode* node, const int num_units = num_filters / rank; const int memory_size = weights_time->dims->data[1]; - // Clear the activation (state's leftmost column). - // TODO(ghodrat): Add a test which initialize activation_state with invalid - // values in leftmost column and make sure it passes. + // Left shift the activation_state, and clear the latest activation (the + // rightmost column). for (int b = 0; b < batch_size; ++b) { float* state_ptr_batch = GetTensorData(state) + b * memory_size * num_filters; - for (int c = 0; c < num_filters; ++c) { - float* state_ptr = state_ptr_batch + c * memory_size; - state_ptr[memory_size - 1] = 0.0f; + for (int f = 0; f < num_filters; ++f) { + tensor_utils::VectorShiftLeft(state_ptr_batch, memory_size, + /*shift_value=*/0.0f); + state_ptr_batch += memory_size; } } @@ -258,15 +246,15 @@ inline void EvalHybridSVDF( // Initialize the weights scale. const float weights_feature_scale = weights_feature->params.scale; - // Clear the activation (state's leftmost column). - // TODO(ghodrat): Add a test which initialize state with invalid values in - // the leftmost column and make sure it passes. + // Left shift the activation_state, and clear the latest activation (the + // rightmost column). for (int b = 0; b < batch_size; ++b) { float* state_ptr_batch = GetTensorData(state) + b * memory_size * num_filters; - for (int c = 0; c < num_filters; ++c) { - float* state_ptr = state_ptr_batch + c * memory_size; - state_ptr[memory_size - 1] = 0.0; + for (int f = 0; f < num_filters; ++f) { + tensor_utils::VectorShiftLeft(state_ptr_batch, memory_size, + /*shift_value=*/0.0f); + state_ptr_batch += memory_size; } } diff --git a/tensorflow/lite/micro/kernels/svdf.cc b/tensorflow/lite/micro/kernels/svdf.cc index 85f8280d1e1..d00e0dc656c 100644 --- a/tensorflow/lite/micro/kernels/svdf.cc +++ b/tensorflow/lite/micro/kernels/svdf.cc @@ -120,24 +120,6 @@ static inline void ApplyTimeWeightsBiasAndActivation( ++output_ptr_batch; } } - - // Left shift the activation_state to make room for next cycle's activation. - // TODO(alanchiao): explore collapsing this into a single loop. - for (int b = 0; b < batch_size; ++b) { - float* state_ptr_batch = - GetTensorData(activation_state) + b * memory_size * num_filters; - for (int f = 0; f < num_filters; ++f) { - // Shift the vector left: - float* batch_ptr = state_ptr_batch; - float* batch_start = state_ptr_batch + 1; - float* batch_end = state_ptr_batch + memory_size; - while (batch_start != batch_end) { - *batch_ptr++ = *batch_start++; - } - state_ptr_batch[memory_size - 1] = 0.0f; - state_ptr_batch += memory_size; - } - } } inline void EvalFloatSVDF(TfLiteContext* context, TfLiteNode* node, @@ -155,15 +137,21 @@ inline void EvalFloatSVDF(TfLiteContext* context, TfLiteNode* node, const int num_units = num_filters / rank; const int memory_size = weights_time->dims->data[1]; - // Clear the activation (activation_state's leftmost column). - // TODO(ghodrat): Add a test which initialize activation_state with invalid - // values in leftmost column and make sure it passes. 
+ // Left shift the activation_state, and clear the latest activation (the + // rightmost column). for (int b = 0; b < batch_size; ++b) { float* state_ptr_batch = GetTensorData(activation_state) + b * memory_size * num_filters; - for (int c = 0; c < num_filters; ++c) { - float* state_ptr = state_ptr_batch + c * memory_size; - state_ptr[memory_size - 1] = 0.0f; + for (int f = 0; f < num_filters; ++f) { + // Shift the vector left: + float* batch_ptr = state_ptr_batch; + float* batch_start = state_ptr_batch + 1; + float* batch_end = state_ptr_batch + memory_size; + while (batch_start != batch_end) { + *batch_ptr++ = *batch_start++; + } + state_ptr_batch[memory_size - 1] = 0.0f; + state_ptr_batch += memory_size; } } @@ -215,6 +203,25 @@ void EvalIntegerSVDF( int32_t scratch_tensor[kScratchTensorMaxSize]; int32_t scratch_output_tensor[kScratchTensorMaxSize]; + // Shift states. No need to set last state, the matmul is not accumulative. + { + for (int b = 0; b < n_batch; ++b) { + int16_t* state_ptr_batch = + GetTensorData(activation_state_tensor) + + b * n_memory * n_filter; + for (int f = 0; f < n_filter; ++f) { + // Shift the vector left: + int16_t* batch_ptr = state_ptr_batch; + int16_t* batch_start = state_ptr_batch + 1; + int16_t* batch_end = state_ptr_batch + n_memory; + while (batch_start != batch_end) { + *batch_ptr++ = *batch_start++; + } + state_ptr_batch += n_memory; + } + } + } + // Feature matmul. { int16_t* state = GetTensorData(activation_state_tensor); @@ -312,26 +319,6 @@ void EvalIntegerSVDF( GetTensorData(output_tensor)[i] = static_cast(x4); } } - - // Shift state. - { - for (int b = 0; b < n_batch; ++b) { - int16_t* state_ptr_batch = - GetTensorData(activation_state_tensor) + - b * n_memory * n_filter; - for (int f = 0; f < n_filter; ++f) { - // Shift the vector left: - int16_t* batch_ptr = state_ptr_batch; - int16_t* batch_start = state_ptr_batch + 1; - int16_t* batch_end = state_ptr_batch + n_memory; - while (batch_start != batch_end) { - *batch_ptr++ = *batch_start++; - } - state_ptr_batch[n_memory - 1] = 0; - state_ptr_batch += n_memory; - } - } - } } } // namespace diff --git a/tensorflow/lite/micro/kernels/xtensa-hifimini/svdf.cc b/tensorflow/lite/micro/kernels/xtensa-hifimini/svdf.cc index d0901e5a2bc..1a0b0fe12c8 100644 --- a/tensorflow/lite/micro/kernels/xtensa-hifimini/svdf.cc +++ b/tensorflow/lite/micro/kernels/xtensa-hifimini/svdf.cc @@ -75,6 +75,25 @@ void EvalIntegerSVDF( int32_t scratch_tensor[kScratchTensorMaxSize]; int32_t scratch_output_tensor[kScratchTensorMaxSize]; + // Shift states. No need to set last state, the matmul is not accumulative. + { + for (int b = 0; b < n_batch; ++b) { + int16_t* state_ptr_batch = + GetTensorData(activation_state_tensor) + + b * n_memory * n_filter; + for (int f = 0; f < n_filter; ++f) { + // Shift the vector left: + int16_t* batch_ptr = state_ptr_batch; + int16_t* batch_start = state_ptr_batch + 1; + int16_t* batch_end = state_ptr_batch + n_memory; + while (batch_start != batch_end) { + *batch_ptr++ = *batch_start++; + } + state_ptr_batch += n_memory; + } + } + } + // Feature matmul. { int16_t* state = GetTensorData(activation_state_tensor); @@ -231,26 +250,6 @@ void EvalIntegerSVDF( static_cast(AE_TRUNCA32Q48(x_56)); } } - - // Shift state. 
- { - for (int b = 0; b < n_batch; ++b) { - int16_t* state_ptr_batch = - GetTensorData(activation_state_tensor) + - b * n_memory * n_filter; - for (int f = 0; f < n_filter; ++f) { - // Shift the vector left: - int16_t* batch_ptr = state_ptr_batch; - int16_t* batch_start = state_ptr_batch + 1; - int16_t* batch_end = state_ptr_batch + n_memory; - while (batch_start != batch_end) { - *batch_ptr++ = *batch_start++; - } - state_ptr_batch[n_memory - 1] = 0; - state_ptr_batch += n_memory; - } - } - } } } // namespace From 14d78c545078805f0cb8edbc67f463e0c969a464 Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Wed, 19 Feb 2020 17:10:13 -0800 Subject: [PATCH 304/442] Automated rollback of commit 9f86c8c5a42e51b42880b69cde7f43f60d7276cc PiperOrigin-RevId: 296092477 Change-Id: I12be08294a71f885a67760098b732a47ff595384 --- .../core/profiler/internal/cpu/host_tracer.cc | 30 +++-- .../profiler/internal/cpu/host_tracer_test.cc | 1 + tensorflow/core/profiler/internal/gpu/BUILD | 1 + .../profiler/internal/gpu/device_tracer.cc | 117 +++++++----------- .../internal/gpu/device_tracer_test.cc | 1 + .../profiler/internal/profiler_interface.h | 2 +- 6 files changed, 60 insertions(+), 92 deletions(-) diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer.cc b/tensorflow/core/profiler/internal/cpu/host_tracer.cc index 4d54093a1e2..998855532f9 100644 --- a/tensorflow/core/profiler/internal/cpu/host_tracer.cc +++ b/tensorflow/core/profiler/internal/cpu/host_tracer.cc @@ -16,8 +16,7 @@ limitations under the License. #include #include "absl/strings/str_split.h" -#include "tensorflow/core/framework/step_stats.pb.h" -#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/common_runtime/step_stats_collector.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/env_time.h" #include "tensorflow/core/profiler/internal/cpu/host_tracer_utils.h" @@ -78,11 +77,11 @@ HostTracer::~HostTracer() { Stop().IgnoreError(); } Status HostTracer::Start() { if (recording_) { - return errors::Internal("TraceMeRecorder already started"); + return Status(error::INTERNAL, "TraceMeRecorder already started"); } recording_ = TraceMeRecorder::Start(host_trace_level_); if (!recording_) { - return errors::Internal("Failed to start TraceMeRecorder"); + return Status(error::INTERNAL, "Failed to start TraceMeRecorder"); } start_timestamp_ns_ = EnvTime::NowNanos(); return Status::OK(); @@ -90,7 +89,7 @@ Status HostTracer::Start() { Status HostTracer::Stop() { if (!recording_) { - return errors::Internal("TraceMeRecorder not started"); + return Status(error::INTERNAL, "TraceMeRecorder not started"); } events_ = TraceMeRecorder::Stop(); recording_ = false; @@ -102,19 +101,16 @@ Status HostTracer::CollectData(RunMetadata* run_metadata) { return errors::Internal("TraceMeRecorder not stopped"); } MakeCompleteEvents(&events_); - - StepStats* step_stats = run_metadata->mutable_step_stats(); - DeviceStepStats* dev_stats = step_stats->add_dev_stats(); - dev_stats->set_device("/host:CPU"); - auto* thread_names = dev_stats->mutable_thread_names(); + StepStatsCollector step_stats_collector(run_metadata->mutable_step_stats()); constexpr char kUserMetadataMarker = '#'; - for (TraceMeRecorder::ThreadEvents& thread : events_) { - uint32_t thread_id = thread.thread.tid; - thread_names->insert({thread_id, thread.thread.name}); - for (TraceMeRecorder::Event& event : thread.events) { + const string cpu_name = "/host:CPU"; + for (auto& thread : events_) { + step_stats_collector.SaveThreadName(cpu_name, thread.thread.tid, 
+ thread.thread.name); + for (auto& event : thread.events) { if (event.start_time && event.end_time) { - NodeExecStats* ns = dev_stats->add_node_stats(); + NodeExecStats* ns = new NodeExecStats; if (event.name.back() != kUserMetadataMarker) { ns->set_node_name(std::move(event.name)); } else { @@ -131,11 +127,13 @@ Status HostTracer::CollectData(RunMetadata* run_metadata) { ns->set_all_start_micros(event.start_time / EnvTime::kMicrosToNanos); ns->set_all_end_rel_micros((event.end_time - event.start_time) / EnvTime::kMicrosToNanos); - ns->set_thread_id(thread_id); + ns->set_thread_id(thread.thread.tid); + step_stats_collector.Save(cpu_name, ns); } } } events_.clear(); + step_stats_collector.Finalize(); return Status::OK(); } diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc b/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc index 412038df9b1..cbf4a9750a3 100644 --- a/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc +++ b/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc @@ -17,6 +17,7 @@ limitations under the License. #include #include #include "absl/types/optional.h" +#include "tensorflow/core/common_runtime/step_stats_collector.h" #include "tensorflow/core/framework/step_stats.pb.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/env.h" diff --git a/tensorflow/core/profiler/internal/gpu/BUILD b/tensorflow/core/profiler/internal/gpu/BUILD index 6fc78e46862..c25a6ac0cfd 100644 --- a/tensorflow/core/profiler/internal/gpu/BUILD +++ b/tensorflow/core/profiler/internal/gpu/BUILD @@ -31,6 +31,7 @@ tf_cuda_library( ], deps = [ ":cupti_utils", + "//tensorflow/core:core_cpu_internal", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/profiler/internal:annotation_stack", diff --git a/tensorflow/core/profiler/internal/gpu/device_tracer.cc b/tensorflow/core/profiler/internal/gpu/device_tracer.cc index 5244134b59a..50a901f3670 100644 --- a/tensorflow/core/profiler/internal/gpu/device_tracer.cc +++ b/tensorflow/core/profiler/internal/gpu/device_tracer.cc @@ -18,7 +18,6 @@ limitations under the License. #include #include -#include #include "absl/container/fixed_array.h" #include "absl/container/flat_hash_map.h" @@ -26,13 +25,10 @@ limitations under the License. 
#include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "absl/strings/str_join.h" -#include "tensorflow/core/framework/step_stats.pb.h" +#include "tensorflow/core/common_runtime/step_stats_collector.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/abi.h" -#include "tensorflow/core/platform/env_time.h" #include "tensorflow/core/platform/macros.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/thread_annotations.h" #include "tensorflow/core/profiler/internal/annotation_stack.h" #include "tensorflow/core/profiler/internal/gpu/cupti_tracer.h" #include "tensorflow/core/profiler/internal/gpu/cupti_wrapper.h" @@ -194,13 +190,13 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector { } void OnEventsDropped(const std::string& reason, uint32 num_events) override {} void Flush() override {} - void Export(StepStats* step_stats) { + void Export(StepStatsCollector* trace_collector) { LOG(INFO) << " GpuTracer has collected " << num_callback_events_ << " callback api events and " << num_activity_events_ << " activity events."; for (int i = 0; i < num_gpus_; ++i) { per_device_collector_[i].Flush(i, start_walltime_ns_, start_gpu_ns_, - step_stats); + trace_collector); } } void Export(XSpace* space) { @@ -248,7 +244,7 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector { }; struct PerDeviceCollector { void AddEvent(CuptiTracerEvent&& event) { - mutex_lock l(m); + absl::MutexLock lock(&mutex); if (event.source == CuptiTracerEventSource::DriverCallback) { // Cupti api callback events were used to populate launch times etc. if (event.correlation_id != CuptiTracerEvent::kInvalidCorrelationId) { @@ -264,16 +260,12 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector { } void Flush(int32 device_ordinal, uint64 start_walltime_ns, - uint64 start_gpu_ns, StepStats* step_stats) { - mutex_lock l(m); - absl::flat_hash_map, - DeviceStepStats*> - stream_dev_stats_map; - DeviceStepStats* unknown_stream_dev_stats = nullptr; - DeviceStepStats* all_streams_dev_stats = nullptr; - DeviceStepStats* memcpy_dev_stats = nullptr; - DeviceStepStats* sync_dev_stats = nullptr; - for (const CuptiTracerEvent& event : events) { + uint64 start_gpu_ns, StepStatsCollector* collector) { + absl::MutexLock lock(&mutex); + stream_device = absl::StrCat("/device:GPU:", device_ordinal, "/stream:"); + memcpy_device = absl::StrCat("/device:GPU:", device_ordinal, "/memcpy"); + sync_device = absl::StrCat("/device:GPU:", device_ordinal, "/sync"); + for (auto& event : events) { NodeExecStats* ns = new NodeExecStats; ns->set_all_start_micros( (start_walltime_ns + (event.start_time_ns - start_gpu_ns)) / 1000); @@ -289,12 +281,7 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector { ns->set_node_name(event.name); ns->set_timeline_label(absl::StrCat("ThreadId ", event.thread_id)); ns->set_thread_id(event.thread_id); - if (sync_dev_stats == nullptr) { - sync_dev_stats = step_stats->add_dev_stats(); - sync_dev_stats->set_device( - absl::StrCat("/device:GPU:", device_ordinal, "/sync")); - } - sync_dev_stats->add_node_stats()->Swap(ns); + collector->Save(sync_device, ns); } } else { // CuptiTracerEventSource::Activity // Get launch information if available. 
@@ -315,30 +302,19 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector { ns->set_node_name(activity_name); switch (event.type) { case CuptiTracerEventType::Kernel: { - ns->set_timeline_label(absl::StrFormat( - "%s regs:%u shm:%u grid:%u,%u,%u block:%u,%u,%u@@%s", - kernel_name, event.kernel_info.registers_per_thread, + const std::string details = absl::StrFormat( + "regs:%u shm:%u grid:%u,%u,%u block:%u,%u,%u", + event.kernel_info.registers_per_thread, event.kernel_info.static_shared_memory_usage, event.kernel_info.grid_x, event.kernel_info.grid_y, event.kernel_info.grid_z, event.kernel_info.block_x, - event.kernel_info.block_y, event.kernel_info.block_z, - event.annotation)); - DeviceStepStats*& stream_dev_stats = - stream_dev_stats_map[std::make_pair(event.stream_id, - event.type)]; - if (stream_dev_stats == nullptr) { - stream_dev_stats = step_stats->add_dev_stats(); - stream_dev_stats->set_device( - absl::StrCat("/device:GPU:", device_ordinal, - "/stream:", event.stream_id)); - } - *stream_dev_stats->add_node_stats() = *ns; - if (all_streams_dev_stats == nullptr) { - all_streams_dev_stats = step_stats->add_dev_stats(); - all_streams_dev_stats->set_device(absl::StrCat( - "/device:GPU:", device_ordinal, "/stream:all")); - } - all_streams_dev_stats->add_node_stats()->Swap(ns); + event.kernel_info.block_y, event.kernel_info.block_z); + ns->set_timeline_label(absl::StrCat(kernel_name, " ", details, + "@@", event.annotation)); + auto nscopy = new NodeExecStats(*ns); + collector->Save(absl::StrCat(stream_device, "all"), ns); + collector->Save(absl::StrCat(stream_device, event.stream_id), + nscopy); break; } case CuptiTracerEventType::MemcpyH2D: @@ -355,33 +331,17 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector { " to device:", event.memcpy_info.destination); } ns->set_timeline_label(std::move(details)); - DeviceStepStats*& stream_dev_stats = - stream_dev_stats_map[std::make_pair(event.stream_id, - event.type)]; - if (stream_dev_stats == nullptr) { - stream_dev_stats = step_stats->add_dev_stats(); - stream_dev_stats->set_device(absl::StrCat( - "/device:GPU:", device_ordinal, "/stream:", event.stream_id, - "<", GetTraceEventTypeName(event.type), ">")); - } - *stream_dev_stats->add_node_stats() = *ns; - if (memcpy_dev_stats == nullptr) { - memcpy_dev_stats = step_stats->add_dev_stats(); - memcpy_dev_stats->set_device( - absl::StrCat("/device:GPU:", device_ordinal, "/memcpy")); - } - memcpy_dev_stats->add_node_stats()->Swap(ns); + auto nscopy = new NodeExecStats(*ns); + collector->Save(memcpy_device, ns); + collector->Save( + absl::StrCat(stream_device, event.stream_id, "<", + GetTraceEventTypeName(event.type), ">"), + nscopy); break; } default: ns->set_timeline_label(activity_name); - if (unknown_stream_dev_stats == nullptr) { - unknown_stream_dev_stats = step_stats->add_dev_stats(); - unknown_stream_dev_stats->set_device( - absl::StrCat("/device:GPU:", device_ordinal, "/stream:")); - } - unknown_stream_dev_stats->add_node_stats()->Swap(ns); - break; + collector->Save(stream_device, ns); } } } @@ -390,7 +350,8 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector { void Flush(uint64 start_gpu_ns, XPlaneBuilder* device_plane, XPlaneBuilder* host_plane) { - mutex_lock l(m); + absl::MutexLock lock(&mutex); + // Tracking event types per line. 
absl::flat_hash_map> events_types_per_line; @@ -478,9 +439,13 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector { } } - mutex m; - std::vector events GUARDED_BY(m); - absl::flat_hash_map correlation_info GUARDED_BY(m); + absl::Mutex mutex; + std::string stream_device GUARDED_BY(mutex); + std::string memcpy_device GUARDED_BY(mutex); + std::string sync_device GUARDED_BY(mutex); + std::vector events GUARDED_BY(mutex); + absl::flat_hash_map correlation_info + GUARDED_BY(mutex); }; absl::FixedArray per_device_collector_; @@ -520,6 +485,7 @@ class GpuTracer : public profiler::ProfilerInterface { CuptiTracer* cupti_tracer_; CuptiTracerOptions options_; + StepStats step_stats_; std::unique_ptr cupti_collector_; }; @@ -629,11 +595,12 @@ Status GpuTracer::CollectData(RunMetadata* run_metadata) { return Status::OK(); case State::kStoppedOk: { // Input run_metadata is shared by profiler interfaces, we need append. - StepStats step_stats; + StepStatsCollector step_stats_collector(&step_stats_); if (cupti_collector_) { - cupti_collector_->Export(&step_stats); + cupti_collector_->Export(&step_stats_collector); } - for (auto& dev_stats : *step_stats.mutable_dev_stats()) { + step_stats_collector.Finalize(); + for (auto& dev_stats : *step_stats_.mutable_dev_stats()) { run_metadata->mutable_step_stats()->add_dev_stats()->Swap(&dev_stats); } return Status::OK(); diff --git a/tensorflow/core/profiler/internal/gpu/device_tracer_test.cc b/tensorflow/core/profiler/internal/gpu/device_tracer_test.cc index 24f8d8771fb..e796a1ac0b7 100644 --- a/tensorflow/core/profiler/internal/gpu/device_tracer_test.cc +++ b/tensorflow/core/profiler/internal/gpu/device_tracer_test.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include "tensorflow/core/common_runtime/direct_session.h" +#include "tensorflow/core/common_runtime/step_stats_collector.h" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/tensor.h" diff --git a/tensorflow/core/profiler/internal/profiler_interface.h b/tensorflow/core/profiler/internal/profiler_interface.h index 081054f03fd..dc8060082f6 100644 --- a/tensorflow/core/profiler/internal/profiler_interface.h +++ b/tensorflow/core/profiler/internal/profiler_interface.h @@ -58,7 +58,7 @@ class ProfilerInterface { // Stops profiling. virtual Status Stop() = 0; - // Saves collected profile data into run_metadata. + // Saves collected profile data into step_stats_collector. // After this or the overload below are called once, subsequent calls might // return empty data. virtual Status CollectData(RunMetadata* run_metadata) = 0; From 34a38afdeec931c08214316beafda005312c46a8 Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Wed, 19 Feb 2020 17:15:40 -0800 Subject: [PATCH 305/442] Error out when saving IndexLookup layer. 
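Saving such a model to HDF5 now fails fast with a NotImplementedError instead of silently writing weights that cannot be restored, and the error points users at the SavedModel format. A minimal sketch of the user-visible behavior, mirroring the new test below; the internal `index_lookup` module path and the file paths are illustrative for this revision only:

    import tensorflow as tf
    from tensorflow.python.keras.layers.preprocessing import index_lookup

    inputs = tf.keras.Input(shape=(None,), dtype=tf.string)
    layer = index_lookup.IndexLookup(max_tokens=10)
    layer.set_vocabulary(["earth", "wind", "and", "fire"])
    model = tf.keras.Model(inputs, layer(inputs))

    try:
      # The table backing this layer is not a tf.Variable, so h5 serialization
      # is rejected rather than producing an unrestorable file.
      model.save("/tmp/model.h5", save_format="h5")
    except NotImplementedError as e:
      print(e)  # suggests saving with save_format="tf" (SavedModel) instead
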
PiperOrigin-RevId: 296093473 Change-Id: I96ce0319a1480399c47d10ecc048007483eca595 --- .../layers/preprocessing/index_lookup_test.py | 14 ++++++++++++++ tensorflow/python/keras/saving/hdf5_format.py | 19 +++++++++++++------ 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py b/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py index d0493ed3b95..fbb6062ce0b 100644 --- a/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py +++ b/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py @@ -35,6 +35,7 @@ from tensorflow.python.keras import testing_utils from tensorflow.python.keras.layers.preprocessing import index_lookup from tensorflow.python.keras.layers.preprocessing import index_lookup_v1 from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils +from tensorflow.python.keras.saving import save from tensorflow.python.keras.utils.generic_utils import CustomObjectScope from tensorflow.python.ops.ragged import ragged_factory_ops from tensorflow.python.platform import test @@ -453,6 +454,19 @@ class IndexLookupSaveableTest(keras_parameterized.TestCase, weights = model.get_weights() model.set_weights(weights) + def test_layer_saving_with_h5(self): + vocab_data = ["earth", "wind", "and", "fire"] + + input_data = keras.Input(shape=(None,), dtype=dtypes.string) + layer = get_layer_class()(max_tokens=10) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + path = os.path.join(self.get_temp_dir(), "model") + with self.assertRaisesRegex(NotImplementedError, + "Save or restore weights that is not.*"): + save.save_model(model, path, save_format="h5") + @keras_parameterized.run_all_keras_modes class IndexLookupErrorTest(keras_parameterized.TestCase, diff --git a/tensorflow/python/keras/saving/hdf5_format.py b/tensorflow/python/keras/saving/hdf5_format.py index b8a66fa59dd..8b0893a598a 100644 --- a/tensorflow/python/keras/saving/hdf5_format.py +++ b/tensorflow/python/keras/saving/hdf5_format.py @@ -31,6 +31,7 @@ from tensorflow.python.keras.saving import model_config as model_config_lib from tensorflow.python.keras.saving import saving_utils from tensorflow.python.keras.utils import conv_utils from tensorflow.python.keras.utils.io_utils import ask_to_proceed_with_overwrite +from tensorflow.python.ops import variables as variables_module from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import serialization @@ -851,22 +852,28 @@ def load_attributes_from_hdf5_group(group, name): return data -def _legacy_weights(model): +def _legacy_weights(layer): """DO NOT USE. - For legacy reason, the model.weights was in the order of + For legacy reason, the layer.weights was in the order of [self.trainable_weights + self.non_trainable_weights], and this order was - used for preserving the weights in h5 format. The new order of model.weights - are the same as model.get_weights() which is more intuitive for user. To + used for preserving the weights in h5 format. The new order of layer.weights + are the same as layer.get_weights() which is more intuitive for user. To keep supporting the existing saved h5 file, this method should be used to save/load weights. In future version, we will delete this method and introduce a breaking change for h5 and stay with the new order for weights. Args: - model: a model or layer instance. 
+ layer: a `tf.keras.Model` or `tf.keras.layers.Layer` instance. Returns: A list of variables with the order of trainable_weights, followed by non_trainable_weights. """ - return model.trainable_weights + model.non_trainable_weights + weights = layer.trainable_weights + layer.non_trainable_weights + if any([not isinstance(w, variables_module.Variable) for w in weights]): + raise NotImplementedError( + 'Save or restore weights that is not an instance of `tf.Variable` is ' + 'not supported in h5, use `save_format=\'tf\'` instead. Got a model ' + 'or layer {} with weights {}'.format(layer.__class__.__name__, weights)) + return weights From 37d4d0484cbb516875e97edfd482d3934aee9d45 Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Wed, 19 Feb 2020 17:15:59 -0800 Subject: [PATCH 306/442] Fuse hardwish for mobilenet v3 The mobilenet v3 frozen graph has extra FakeQuant ops which blocks the fusion, thus we create a special pattern to remove the redundant FakeQuant ops. PiperOrigin-RevId: 296093529 Change-Id: Ic5bc6808afb12b2004ed7b6f3a81f914df917d5e --- tensorflow/compiler/mlir/lite/ir/tfl_ops.td | 2 +- .../mlir/lite/transforms/optimize_patterns.td | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 3bb2b67be35..a04e1d44ea6 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -1349,7 +1349,7 @@ def TFL_GreaterOp : TFL_Op<"greater", [ } def TFL_HardSwishOp: TFL_Op<"hard_swish", [NoSideEffect, - SameOperandsAndResultType]> { + SameOperandsAndResultShape]> { let summary = "Hardswish activation function."; let description = [{ Computes hard-swish activation function diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td index bdf73ff3787..71017fe2801 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td @@ -199,6 +199,22 @@ def : Pat< (TFL_HardSwishOp $x), [(EqualOperands $x, $y)]>; +// Matching HardSwish with extra FakeQuant. These FakeQuant ops were due to +// incorrect placement in the quantization aware training. +// TODO(b/149735743): We should make the placement automatically. +def : Pat< + (TFL_MulOp (TFL_DequantizeOp (TFL_QuantizeOp + (TFL_MulOp + $x, (TFL_DequantizeOp (TFL_QuantizeOp (TFL_AddOp + $y, + (ConstantOp ConstantAttr, "3.0f">), + TFL_AF_Relu6), $qattr2)), + TFL_AF_None), $qattr1)), + (ConstantOp ConstantAttr, "0.166666666f">), + TFL_AF_None), + (TFL_HardSwishOp $x), + [(EqualOperands $x, $y)]>; + // Constraint that the attribute value is less than 'n' class ConstDoubleValueLessThan : Constraint< CPred<"$0.isa() && " From 10568a537f479732e87fb4e571e0937f51953ec7 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Wed, 19 Feb 2020 17:16:52 -0800 Subject: [PATCH 307/442] Use Env::LocalTempFilename for a temp filename. This function works both in and outside of tests. Additionally, LocalTempFilename works well on Windows where as TmpDir is a little problematic because of bazel oddities. 
PiperOrigin-RevId: 296093672 Change-Id: I998ac9dab0077d7cfc09631db4bf295f4eef155a --- tensorflow/compiler/xla/BUILD | 1 + tensorflow/compiler/xla/text_literal_writer_test.cc | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index dd9f83bf26e..01f35df0e20 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -722,6 +722,7 @@ tf_cc_test( ":text_literal_writer", ":types", "//tensorflow/core:lib", + "//tensorflow/core:test", "//tensorflow/core:test_main", ], ) diff --git a/tensorflow/compiler/xla/text_literal_writer_test.cc b/tensorflow/compiler/xla/text_literal_writer_test.cc index 5cbaf2fcc19..667d6296117 100644 --- a/tensorflow/compiler/xla/text_literal_writer_test.cc +++ b/tensorflow/compiler/xla/text_literal_writer_test.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" @@ -35,12 +36,12 @@ TEST(TextLiteralWriterTest, WritesFloatLiteral) { {3.14, 2.17}, {1.23, 4.56}, }); - string path = - tensorflow::io::JoinPath(tensorflow::testing::TmpDir(), "/whatever"); + string path; + ASSERT_TRUE(tensorflow::Env::Default()->LocalTempFilename(&path)); ASSERT_IS_OK(TextLiteralWriter::WriteToPath(literal, path)); string contents; - TF_CHECK_OK(tensorflow::ReadFileToString(tensorflow::Env::Default(), path, - &contents)); + TF_ASSERT_OK(tensorflow::ReadFileToString(tensorflow::Env::Default(), path, + &contents)); const string expected = R"(f32[2,2] (0, 0): 3.14 (0, 1): 2.17 From e23ef990f925ec94b6d1556e9fc7b49bf6449ff9 Mon Sep 17 00:00:00 2001 From: Jonathan DEKHTIAR Date: Wed, 19 Feb 2020 17:32:07 -0800 Subject: [PATCH 308/442] Fix Bazel not building anymore with the commit 09fe958f --- configure.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.py b/configure.py index 64956049c34..ed09a693fd4 100644 --- a/configure.py +++ b/configure.py @@ -49,7 +49,7 @@ _TF_BAZELRC_FILENAME = '.tf_configure.bazelrc' _TF_WORKSPACE_ROOT = '' _TF_BAZELRC = '' _TF_CURRENT_BAZEL_VERSION = None -_TF_MIN_BAZEL_VERSION = '1.2.1' +_TF_MIN_BAZEL_VERSION = '2.0.0' _TF_MAX_BAZEL_VERSION = '2.0.0' NCCL_LIB_PATHS = [ From b56c66d88342db9165eef787eff3d46c764c98ef Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 17:19:24 -0800 Subject: [PATCH 309/442] Make map_xla_to_scalar_op as a library. 
PiperOrigin-RevId: 296094105 Change-Id: I584fceb8adafded0b44b0fc0fb5acb336d6a35d2 --- tensorflow/compiler/mlir/xla/BUILD | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index a4115479a0b..0e912a30ab0 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -131,12 +131,24 @@ cc_library( ) cc_library( - name = "lhlo_legalize_to_affine", - srcs = ["transforms/lhlo_legalize_to_affine.cc"], + name = "map_xla_to_scalar_op", + srcs = [], hdrs = ["transforms/map_xla_to_scalar_op.h"], deps = [ ":hlo", ":lhlo", + "@llvm-project//llvm:support", + "@llvm-project//mlir:StandardOps", + ], +) + +cc_library( + name = "lhlo_legalize_to_affine", + srcs = ["transforms/lhlo_legalize_to_affine.cc"], + deps = [ + ":hlo", + ":lhlo", + ":map_xla_to_scalar_op", "//tensorflow/compiler/xla:status", "@com_google_absl//absl/memory", "@llvm-project//llvm:support", @@ -151,10 +163,10 @@ cc_library( cc_library( name = "xla_legalize_to_linalg", srcs = ["transforms/xla_legalize_to_linalg.cc"], - hdrs = ["transforms/map_xla_to_scalar_op.h"], deps = [ ":hlo", ":lhlo", + ":map_xla_to_scalar_op", "@com_google_absl//absl/memory", "@llvm-project//llvm:support", "@llvm-project//mlir:IR", @@ -169,10 +181,10 @@ cc_library( cc_library( name = "lhlo_legalize_to_gpu", srcs = ["transforms/lhlo_legalize_to_gpu.cc"], - hdrs = ["transforms/map_xla_to_scalar_op.h"], deps = [ ":hlo", ":lhlo", + ":map_xla_to_scalar_op", "@com_google_absl//absl/memory", "@llvm-project//llvm:support", "@llvm-project//mlir:GPUDialect", From 8b7a3db0b6e09415b5640be4986fb4d7c6e5209a Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Wed, 19 Feb 2020 17:20:19 -0800 Subject: [PATCH 310/442] [XLA] Respect TF_DETERMINISTIC_OPS environment variable for reductions PiperOrigin-RevId: 296094275 Change-Id: Iadcbf33d5d6432413c86d4d176865980de252eeb --- tensorflow/compiler/xla/service/gpu/BUILD | 1 + .../compiler/xla/service/gpu/amdgpu_compiler.cc | 1 + .../compiler/xla/service/gpu/nvptx_compiler.cc | 14 +++++++++++++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index c812272829a..28e33b2a17e 100755 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -1285,6 +1285,7 @@ cc_library( ":reduction_dimension_grouper", ":reduction_layout_normalizer", ":target_constants", + ":tree_reduction_rewriter", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla/service:algebraic_simplifier", "//tensorflow/compiler/xla/service:hlo", diff --git a/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc index 0e2e27ee9a3..97013804271 100644 --- a/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc @@ -26,6 +26,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/gpu/reduction_dimension_grouper.h" #include "tensorflow/compiler/xla/service/gpu/reduction_layout_normalizer.h" #include "tensorflow/compiler/xla/service/gpu/target_constants.h" +#include "tensorflow/compiler/xla/service/gpu/tree_reduction_rewriter.h" #include "tensorflow/compiler/xla/service/hlo_constant_folding.h" #include "tensorflow/compiler/xla/service/hlo_cse.h" #include "tensorflow/compiler/xla/service/hlo_pass_fix.h" diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index f61ccd77c86..a1a901f0b94 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -55,6 +55,7 @@ limitations under the License. #include "tensorflow/core/platform/cuda_libdevice_path.h" #include "tensorflow/core/platform/tracing.h" #include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/util/env_var.h" #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" #include "tensorflow/stream_executor/gpu/asm_compiler.h" @@ -151,6 +152,16 @@ Status NVPTXCompiler::OptimizeHloConvolutionCanonicalization( return Status::OK(); } +// TODO(cheshire): Duplication with gpu_conv_algorithm picker, figure out a +// right way to share this. +static bool RequireDeterminism() { + bool deterministic_ops = false; + TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_DETERMINISTIC_OPS", + /*default_val=*/false, + &deterministic_ops)); + return deterministic_ops; +} + Status NVPTXCompiler::OptimizeHloPostLayoutAssignment( HloModule* hlo_module, se::StreamExecutor* stream_exec, se::DeviceMemoryAllocator* device_allocator) { @@ -172,7 +183,8 @@ Status NVPTXCompiler::OptimizeHloPostLayoutAssignment( options.set_is_layout_sensitive(true); pipeline.AddPass>(options); - if (hlo_module->config().debug_options().xla_gpu_deterministic_reductions()) { + if (RequireDeterminism() || + hlo_module->config().debug_options().xla_gpu_deterministic_reductions()) { pipeline.AddPass>(); } From 7ad1eb110f1966f6197f96f9e3b084137c350231 Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Wed, 19 Feb 2020 17:25:04 -0800 Subject: [PATCH 311/442] NFC: Add a TODO to move HLO Relu legalizations to TF to TF lowering PiperOrigin-RevId: 296095163 Change-Id: Ic0b8d26c11c64e6584eef0da38b87e71d2dd03e8 --- .../compiler/mlir/xla/transforms/legalize_tf_patterns.td | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td index a78d9cc2d2d..872a288c259 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td @@ -368,6 +368,9 @@ def : Pat<(TF_ConstOp:$res ElementsAttr:$value), (HLO_ConstOp $value), // Relu op patterns. //===----------------------------------------------------------------------===// +// TODO(hinsu): Make these patterns to TF to TF lowering. Relu6 lowering will +// require HLO canonicalization of min and max on a tensor to ClampOp. + // TODO(hinsu): Lower unsinged and quantized types after supporting // them in GetScalarOfType. def : Pat<(TF_ReluOp AnyRankedTensor:$input), From dea81b04c311cc9e420217d201ec4fabef5963d6 Mon Sep 17 00:00:00 2001 From: Juhyun Lee Date: Wed, 19 Feb 2020 17:25:42 -0800 Subject: [PATCH 312/442] Remove duplicate error check in RESHAPE. 
PiperOrigin-RevId: 296095268 Change-Id: I22c50cf2fb8cb1343c8c99f9cf557aad45aff15c --- tensorflow/lite/delegates/gpu/gl/kernels/reshape.cc | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/reshape.cc b/tensorflow/lite/delegates/gpu/gl/kernels/reshape.cc index 5a0b6d7e3c3..cd01417cff5 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/reshape.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/reshape.cc @@ -38,14 +38,11 @@ class Reshape : public NodeShader { auto output = ctx.graph->FindOutputs(ctx.node->id)[0]; if (input->tensor.shape.DimensionsProduct() != output->tensor.shape.DimensionsProduct()) { - return InvalidArgumentError("Dimensions product is reshape don't match"); + return InvalidArgumentError( + "Number of elements in input & output tensors don't match."); } auto attr = absl::any_cast(ctx.node->operation.attributes); - if (input->tensor.shape.DimensionsProduct() != - output->tensor.shape.DimensionsProduct()) { - return InvalidArgumentError("Dimensions product is reshape don't match"); - } if (attr.new_shape != output->tensor.shape) { return InvalidArgumentError( "Dimensions for output does not match new_shape attribute"); From 1e4f7195a8e35ccf9edb72e1d90e06c203b99faa Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Wed, 19 Feb 2020 17:27:34 -0800 Subject: [PATCH 313/442] Use JoinPath over a fixed string for building paths. The fixed path doesn't work well on Windows when the correct path separator is used. PiperOrigin-RevId: 296095586 Change-Id: I9fe0459ef58a310bf471cf2548b3f7e23b764502 --- tensorflow/core/platform/resource_loader_test.cc | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/platform/resource_loader_test.cc b/tensorflow/core/platform/resource_loader_test.cc index 590eb889c13..75bdca19452 100644 --- a/tensorflow/core/platform/resource_loader_test.cc +++ b/tensorflow/core/platform/resource_loader_test.cc @@ -17,17 +17,22 @@ limitations under the License. #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/path.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { +namespace { -const char kDataDependencyPath[] = "tensorflow/core/platform/resource_loader.h"; +string DataDependencyPath() { + return io::JoinPath("tensorflow", "core", "platform", "resource_loader.h"); +} TEST(ResourceLoaderTest, FindsAndOpensFile) { - string filepath = GetDataDependencyFilepath(kDataDependencyPath); + string filepath = GetDataDependencyFilepath(DataDependencyPath()); Status s = Env::Default()->FileExists(filepath); EXPECT_TRUE(s.ok()) << "No file found at this location: " << filepath; } +} // namespace } // namespace tensorflow From ccfc7fd53103fd44138e1c526859f2d7f3814557 Mon Sep 17 00:00:00 2001 From: Ran Chen Date: Wed, 19 Feb 2020 17:31:02 -0800 Subject: [PATCH 314/442] Retire agg_small_grads_max_bytes, agg_small_grads_max_group agg_small_grads_max_bytes and agg_small_grads_max_group aren't effective to public users. They're not exposed in the public API, so they're always their default value (agg_small_grads_max_bytes=0), which are then ignored. 
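With these arguments gone, gradient repacking is controlled solely by num_packs. A rough sketch of how the remaining surface is reached through the public API; the device list and pack count are illustrative, not part of this change:

    import tensorflow as tf

    # Pack all per-device gradients into one concatenated tensor before the
    # NCCL all-reduce; there is no separate small-tensor aggregation anymore.
    cross_ops = tf.distribute.NcclAllReduce(num_packs=1)
    strategy = tf.distribute.MirroredStrategy(
        devices=["/gpu:0", "/gpu:1"], cross_device_ops=cross_ops)
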
PiperOrigin-RevId: 296096179 Change-Id: Id7397539441a0e34af5c76d994eb08028e289b6d --- .../python/distribute/cross_device_ops.py | 107 +++--------------- .../distribute/cross_device_ops_test.py | 19 +--- 2 files changed, 20 insertions(+), 106 deletions(-) diff --git a/tensorflow/python/distribute/cross_device_ops.py b/tensorflow/python/distribute/cross_device_ops.py index 7f6230e9404..ba8f7542712 100644 --- a/tensorflow/python/distribute/cross_device_ops.py +++ b/tensorflow/python/distribute/cross_device_ops.py @@ -589,58 +589,11 @@ class _ConcatAndSplitPacker(object): return aggregated_device_grads -class _AggregateSmallTensorPacker(object): - """Concatenate small gradient tensors together for reduction.""" - - def __init__(self, - agg_small_grads_max_bytes=1048576, - agg_small_grads_max_group=16): - """Initialize the _AggregateSmallTensorPacker object. - - Args: - agg_small_grads_max_bytes: largest tensor eligible for aggregation, - in number of bytes. - agg_small_grads_max_group: largest permitted aggregation of small - tensors. - - Raises: - ValueError: if `agg_small_grads_max_bytes` or `agg_small_grads_max_group` - is not greater than 0. - """ - if agg_small_grads_max_bytes <= 0 or agg_small_grads_max_group <= 0: - raise ValueError("agg_small_grads_max_bytes and agg_small_grads_max_group" - " should both be greater than zero.") - self.agg_small_grads_max_bytes = agg_small_grads_max_bytes - self.agg_small_grads_max_group = agg_small_grads_max_group - - def pack(self, grouped_grads_and_vars): - """Aggregate small tensors.""" - if (self.agg_small_grads_max_bytes > 0 and - self.agg_small_grads_max_group > 0): - device_grads, self.packing = cross_device_utils.pack_small_tensors( - grouped_grads_and_vars, - max_bytes=self.agg_small_grads_max_bytes, - max_group=self.agg_small_grads_max_group) - return device_grads - - def unpack(self, summed_device_grad_packs): - """Reverse the aggregation process.""" - return cross_device_utils.unpack_small_tensors(summed_device_grad_packs, - self.packing) - - -def _pack_tensors(device_grads, - num_packs=0, - agg_small_grads_max_bytes=0, - agg_small_grads_max_group=0): +def _pack_tensors(device_grads, num_packs=0): """Pack tensors if specified.""" if num_packs > 0: tensor_packer = _ConcatAndSplitPacker(num_packs) device_grad_packs = tensor_packer.pack(device_grads) - elif agg_small_grads_max_bytes > 0 and agg_small_grads_max_group > 0: - tensor_packer = _AggregateSmallTensorPacker(agg_small_grads_max_bytes, - agg_small_grads_max_group) - device_grad_packs = tensor_packer.pack(device_grads) else: tensor_packer = None device_grad_packs = device_grads @@ -657,34 +610,19 @@ def _unpack_tensors(reduced, tensor_packer=None): class AllReduceCrossDeviceOps(CrossDeviceOps): """Reduction using all-reduce.""" - def __init__(self, - all_reduce_alg="nccl", - num_packs=1, - agg_small_grads_max_bytes=0, - agg_small_grads_max_group=10): + def __init__(self, all_reduce_alg="nccl", num_packs=1): """All-reduce implementation of CrossDeviceOps. - Before performing all-reduce, tensors will be repacked or aggregated for - more efficient cross-device transportation: - 1) If `num_packs` is non-zero, pack values into - `num_packs` splits. - 2) Otherwise, if `agg_small_grads_max_bytes` > 0 and - `agg_small_grads_max_group` > 0, aggregate values smaller than - `agg_small_grads_max_bytes` into groups with at most - `agg_small_grads_max_group` values. - 3) Otherwise, no repacking or grouping will happen. 
+ Before performing all-reduce, tensors will be packed for more efficient + cross-device transportation. Args: all_reduce_alg: the all-reduce algorithm to use, currently only "nccl" or "hierarchical_copy" are supported. - num_packs: see above. - agg_small_grads_max_bytes: see above. - agg_small_grads_max_group: see above. + num_packs: If non-zero, pack values into `num_packs` splits. """ self._all_reduce_alg = all_reduce_alg self._num_packs = num_packs - self._agg_small_grads_max_bytes = agg_small_grads_max_bytes - self._agg_small_grads_max_group = agg_small_grads_max_group self._simple_cross_replica_ops = ReductionToOneDevice() super(AllReduceCrossDeviceOps, self).__init__() @@ -724,18 +662,14 @@ class AllReduceCrossDeviceOps(CrossDeviceOps): def _do_batch_all_reduce(self, reduce_op, dense_values): """Run batch all-reduces.""" logging.log_first_n( - logging.INFO, "batch_all_reduce: %d all-reduces with algorithm = %s, " - "num_packs = %d, agg_small_grads_max_bytes = %d and " - "agg_small_grads_max_group = %d" % - (len(dense_values), self._all_reduce_alg, self._num_packs, - self._agg_small_grads_max_bytes, self._agg_small_grads_max_group), 10) + logging.INFO, + "batch_all_reduce: %d all-reduces with algorithm = %s, num_packs = %d" % + (len(dense_values), self._all_reduce_alg, self._num_packs), 10) destinations = dense_values[0]._devices # pylint: disable=protected-access grouped = _group_value_by_device(dense_values) - device_grad_packs, tensor_packer = _pack_tensors( - grouped, self._num_packs, self._agg_small_grads_max_bytes, - self._agg_small_grads_max_group) + device_grad_packs, tensor_packer = _pack_tensors(grouped, self._num_packs) # The actual aggregation of the repacked gradients. Note that they are # sharded among different aggregation trees. So it is important to strike @@ -839,9 +773,7 @@ class MultiWorkerAllReduce(AllReduceCrossDeviceOps): worker_devices, num_gpus_per_worker, all_reduce_spec=("pscpu/pscpu", 2, -1), - num_packs=0, - agg_small_grads_max_bytes=0, - agg_small_grads_max_group=10): + num_packs=0): """Initialize the all-reduce algorithm. Args: @@ -868,15 +800,10 @@ class MultiWorkerAllReduce(AllReduceCrossDeviceOps): "pscpu/pscpu" algorithm. The third elements should be in increasing order across tuples and end with -1 which indicates infinity. num_packs: see AllReduceCrossDeviceOps. - agg_small_grads_max_bytes: see AllReduceCrossDeviceOps. - agg_small_grads_max_group: see AllReduceCrossDeviceOps. 
""" self._worker_devices = worker_devices self._num_gpus_per_worker = num_gpus_per_worker - super(MultiWorkerAllReduce, self).__init__( - num_packs=num_packs, - agg_small_grads_max_bytes=agg_small_grads_max_bytes, - agg_small_grads_max_group=agg_small_grads_max_group) + super(MultiWorkerAllReduce, self).__init__(num_packs=num_packs) def validate_and_complete_spec(spec): """Validate and complete the all-reduce spec.""" @@ -907,12 +834,9 @@ class MultiWorkerAllReduce(AllReduceCrossDeviceOps): def _batch_all_reduce(self, reduce_op, per_replica_values): """All-reduce algorithm in a batch.""" logging.log_first_n( - logging.INFO, - "Distributed batch_all_reduce: %d all-reduces with " - "allreduce_spec = %r, num_packs = %d, agg_small_grads_max_bytes = %d, " - "and agg_small_grads_max_group = %d" % - (len(per_replica_values), self._all_reduce_spec, self._num_packs, - self._agg_small_grads_max_bytes, self._agg_small_grads_max_group), 10) + logging.INFO, "Distributed batch_all_reduce: %d all-reduces with " + "allreduce_spec = %r, num_packs = %d" % + (len(per_replica_values), self._all_reduce_spec, self._num_packs), 10) device_grads = _group_value_by_device(per_replica_values) @@ -935,8 +859,7 @@ class MultiWorkerAllReduce(AllReduceCrossDeviceOps): spec_tuple.limit, remaining_grads) if this_grads: device_grad_packs, tensor_packer = _pack_tensors( - this_grads, self._num_packs, self._agg_small_grads_max_bytes, - self._agg_small_grads_max_group) + this_grads, self._num_packs) range_agg_grads = cross_device_utils.sum_gradients_all_reduce( self._worker_devices, device_grad_packs, len(self._worker_devices), spec_tuple.alg, spec_tuple.shards, range(self._num_gpus_per_worker)) diff --git a/tensorflow/python/distribute/cross_device_ops_test.py b/tensorflow/python/distribute/cross_device_ops_test.py index b60809fd3b5..c91ec38bfd1 100644 --- a/tensorflow/python/distribute/cross_device_ops_test.py +++ b/tensorflow/python/distribute/cross_device_ops_test.py @@ -284,19 +284,15 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase): cross_device_ops=[ combinations.NamedObject( "AllReduce", - cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)), + cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1)), combinations.NamedObject( "AllReduceNoGradientRepacking", - cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)), + cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0)), combinations.NamedObject("NcclAllReduce", cross_device_ops_lib.NcclAllReduce()), combinations.NamedObject( "HierarchicalCopy", cross_device_ops_lib.HierarchicalCopyAllReduce(8)), - combinations.NamedObject( - "HierarchicalCopyAggregateSmallTensors", - cross_device_ops_lib.AllReduceCrossDeviceOps( - "hierarchical_copy", 0, 100, 10)) ], devices=[ ["/gpu:0", "/gpu:1"], @@ -397,22 +393,17 @@ class MultiWorkerCrossDeviceOpsTest(multi_worker_test_base.MultiWorkerTestBase, "MultiWorkerAllReduce", cross_device_ops_lib.MultiWorkerAllReduce(worker_devices, 2, ("pscpu/pscpu", 2, -1), - 0, 0, 0)), + 0)), combinations.NamedObject( "MultiWorkerAllReducePack", cross_device_ops_lib.MultiWorkerAllReduce(worker_devices, 2, ("pscpu/pscpu", 2, -1), - 1, 0, 0)), - combinations.NamedObject( - "MultiWorkerAllReduceAggregation", - cross_device_ops_lib.MultiWorkerAllReduce(worker_devices, 2, - ("pscpu/pscpu", 2, -1), - 0, 100, 10)), + 1)), combinations.NamedObject( "MultiWorkerAllReduceMultipleSpecs", cross_device_ops_lib.MultiWorkerAllReduce( worker_devices, 2, [("pscpu/pscpu", 2, 100), - ("xring", 2, -1)], 0, 0, 0)), + ("xring", 2, -1)], 
0)), ], devices=[ [ From 1f5bc8a9799ec226c059d257a7817b738ab515d4 Mon Sep 17 00:00:00 2001 From: Allen Lavoie Date: Wed, 19 Feb 2020 17:31:04 -0800 Subject: [PATCH 315/442] Add an experimental eager C API for generically fetching and setting op attributes. Right now you can only fetch the whole attribute map and set it wholesale, but we can add more fine-grained attribute control in the future. This allows the custom device API to pass in attributes, and custom devices to forward these to their own TFE_Execute calls. This is required for creating variables. PiperOrigin-RevId: 296096192 Change-Id: I98c23bdcd13e479235b3e27850b1bb0bd7a53bba --- tensorflow/c/eager/c_api.cc | 25 ++++--- tensorflow/c/eager/c_api_experimental.h | 28 ++++++-- tensorflow/c/eager/c_api_internal.h | 9 +++ tensorflow/c/eager/c_api_test.cc | 34 +++++++++ tensorflow/c/eager/custom_device_test.cc | 90 +++++++++++++++++++++++- 5 files changed, 171 insertions(+), 15 deletions(-) diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index 1beca1eacb7..4fa6ed64a2f 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -1199,14 +1199,6 @@ TFE_TensorHandle* TFE_NewTensorHandleFromDeviceMemory( dimvec[i] = static_cast(dims[i]); } - if (dtype == TF_STRING || dtype == TF_RESOURCE || - !tensorflow::DataTypeCanUseMemcpy( - static_cast(dtype))) { - status->status = tensorflow::errors::InvalidArgument( - "Trying to create a tensor with a pointer to non-pod memory."); - deallocator(data, len, deallocator_arg); - return nullptr; - } // TODO(apassos) do we need to wrap the deallocator here to make sure to sync // the device? TF_ManagedBuffer* buf = @@ -1680,6 +1672,19 @@ void TFE_ContextStartStep(TFE_Context* ctx) { ctx->context->StartStep(); } void TFE_ContextEndStep(TFE_Context* ctx) { ctx->context->EndStep(); } +void TFE_OpGetAttrs(TFE_Op* op, TFE_OpAttrs* attrs) { + *attrs = TFE_OpAttrs(&op->operation.Attrs()); +} + +void TFE_OpAddAttrs(TFE_Op* op, const TFE_OpAttrs* attrs) { + tensorflow::AttrValueMap m; + attrs->attributes->FillAttrValueMap(&m); + tensorflow::AttrBuilder* destination = op->operation.MutableAttrs(); + for (auto attribute : m) { + destination->Set(attribute.first, attribute.second); + } +} + namespace tensorflow { void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op, const tensorflow::AttrValue& default_value, @@ -1799,10 +1804,10 @@ class CustomDeviceAPI : public tensorflow::CustomDevice { op->Inputs()[i])}); } std::vector outputs(*num_retvals); - // TODO(allenl): figure out how to get attrs from EagerOperation TF_Status status; + TFE_OpAttrs attributes(&op->Attrs()); device_.execute(inputs.size(), inputs.data(), op->Name().c_str(), - num_retvals, outputs.data(), &status, info_); + &attributes, num_retvals, outputs.data(), &status, info_); if (status.status.ok()) { for (int i = 0; i < *num_retvals; ++i) { retvals[i] = tensorflow::down_cast( diff --git a/tensorflow/c/eager/c_api_experimental.h b/tensorflow/c/eager/c_api_experimental.h index d2b632bc301..da27bc51360 100644 --- a/tensorflow/c/eager/c_api_experimental.h +++ b/tensorflow/c/eager/c_api_experimental.h @@ -424,7 +424,27 @@ TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewTensorHandleFromDeviceMemory( TF_CAPI_EXPORT extern void TFE_HostAddressSpace(TFE_Context* ctx, TF_Buffer* buf); -#define TFE_CUSTOM_DEVICE_VERSION 0 +// APIs for generically dealing with op attributes (e.g. when forwarding them +// through custom device implementations). 
+// +// TODO(allenl): Currently these are black boxes, but we should have some way to +// inspect values. This would let people e.g. copy over most attributes and then +// modify some based on their values. + +// A reference to an op's name -> attribute mapping +typedef struct TFE_OpAttrs TFE_OpAttrs; + +// Fetch a struct with a reference to information about attributes of `op`. +// +// The `attrs` struct does not own any memory, and `op` must outlive it. +TF_CAPI_EXPORT extern void TFE_OpGetAttrs(TFE_Op* op, TFE_OpAttrs* attrs); + +// Add attributes in `attrs` to `op`. +// +// Does not overwrite or update existing attributes, but adds new ones. +TF_CAPI_EXPORT extern void TFE_OpAddAttrs(TFE_Op* op, const TFE_OpAttrs* attrs); + +#define TFE_CUSTOM_DEVICE_VERSION 1 // Struct to be filled in typedef struct TFE_CustomDevice { @@ -441,10 +461,10 @@ typedef struct TFE_CustomDevice { void* device_info); // Method to execute an operation. - // TODO(allenl) figure out a generic way of passing attrs here void (*execute)(int num_inputs, TFE_TensorHandle** inputs, - const char* operation_name, int* num_outputs, - TFE_TensorHandle** outputs, TF_Status* s, void* device_info); + const char* operation_name, const TFE_OpAttrs* attributes, + int* num_outputs, TFE_TensorHandle** outputs, TF_Status* s, + void* device_info); // Method to delete a device. void (*delete_device)(void* device_info); diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h index f4bdcc05489..01038a33549 100644 --- a/tensorflow/c/eager/c_api_internal.h +++ b/tensorflow/c/eager/c_api_internal.h @@ -236,4 +236,13 @@ struct TFE_Executor { tensorflow::EagerExecutor* unowned_executor; }; +struct TFE_OpAttrs { + explicit TFE_OpAttrs() : attributes(nullptr) {} + + explicit TFE_OpAttrs(const tensorflow::AttrBuilder* value) + : attributes(value) {} + + const tensorflow::AttrBuilder* attributes; +}; + #endif // TENSORFLOW_C_EAGER_C_API_INTERNAL_H_ diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc index 9ae1e7b896b..91026a0650c 100644 --- a/tensorflow/c/eager/c_api_test.cc +++ b/tensorflow/c/eager/c_api_test.cc @@ -1449,4 +1449,38 @@ TEST(CAPI, TestTFE_OpGetInputAndOutputLengthsFailForUnknownArguments) { TFE_DeleteContext(ctx); } +TEST(CAPI, TestTFE_OpGetAttrs) { + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_Context* ctx = TFE_NewContext(opts, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_Op* varop = TFE_NewOp(ctx, "VarHandleOp", status); + TFE_OpSetAttrType(varop, "dtype", TF_INT64); + TFE_OpSetAttrShape(varop, "shape", {}, 0, status); + TFE_OpAttrs attributes; + TFE_OpGetAttrs(varop, &attributes); + + TFE_Op* varop_copy = TFE_NewOp(ctx, "VarHandleOp", status); + TFE_OpSetAttrType(varop_copy, "dtype", TF_FLOAT); + TFE_OpAddAttrs(varop_copy, &attributes); + unsigned char is_list = 0; + ASSERT_EQ(TF_ATTR_TYPE, + TFE_OpGetAttrType(varop_copy, "dtype", &is_list, status)); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + ASSERT_EQ(TF_ATTR_SHAPE, + TFE_OpGetAttrType(varop_copy, "shape", &is_list, status)); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + tensorflow::AttrValueMap attr_values; + varop_copy->operation.Attrs().FillAttrValueMap(&attr_values); + EXPECT_EQ(tensorflow::DT_FLOAT, attr_values.find("dtype")->second.type()); + + TF_DeleteStatus(status); + TFE_DeleteOp(varop); + TFE_DeleteOp(varop_copy); + TFE_DeleteContext(ctx); +} + } 
// namespace diff --git a/tensorflow/c/eager/custom_device_test.cc b/tensorflow/c/eager/custom_device_test.cc index 3a6f9d93164..742844c3f75 100644 --- a/tensorflow/c/eager/custom_device_test.cc +++ b/tensorflow/c/eager/custom_device_test.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/c/eager/c_api_experimental.h" #include "tensorflow/c/eager/c_api_test_util.h" #include "tensorflow/c/tf_status.h" +#include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/platform/test.h" namespace { @@ -83,12 +84,14 @@ TFE_TensorHandle* CopyTensorFromLoggingDevice(TFE_TensorHandle* tensor, } void LoggingDeviceExecute(int num_inputs, TFE_TensorHandle** inputs, - const char* operation_name, int* num_outputs, + const char* operation_name, + const TFE_OpAttrs* attributes, int* num_outputs, TFE_TensorHandle** outputs, TF_Status* s, void* device_info) { LoggingDevice* dev = reinterpret_cast(device_info); TFE_Op* op(TFE_NewOp(dev->ctx, operation_name, s)); if (TF_GetCode(s) != TF_OK) return; + TFE_OpAddAttrs(op, attributes); TFE_OpSetDevice(op, dev->underlying_device.c_str(), s); for (int j = 0; j < num_inputs; ++j) { TFE_TensorHandle* input = inputs[j]; @@ -203,4 +206,89 @@ TEST(CUSTOM_DEVICE, ResetOperation) { ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); } +TEST(CUSTOM_DEVICE, MakeVariable) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + std::unique_ptr opts( + TFE_NewContextOptions(), TFE_DeleteContextOptions); + std::unique_ptr context( + TFE_NewContext(opts.get(), status.get()), TFE_DeleteContext); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + bool arrived = false; + bool executed = false; + const char* name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; + RegisterLoggingDevice(context.get(), name, &arrived, &executed); + + // Create a variable handle placed on the custom device. + std::unique_ptr op( + TFE_NewOp(context.get(), "VarHandleOp", status.get()), TFE_DeleteOp); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + TFE_OpSetAttrType(op.get(), "dtype", TF_FLOAT); + TFE_OpSetAttrShape(op.get(), "shape", {}, 0, status.get()); + TFE_OpSetAttrString(op.get(), "container", "", 0); + TFE_OpSetAttrString(op.get(), "shared_name", "", 0); + TFE_OpSetDevice(op.get(), name, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + TFE_TensorHandle* var_handle = nullptr; + int num_retvals = 1; + executed = false; + TFE_Execute(op.get(), &var_handle, &num_retvals, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + ASSERT_TRUE(executed); + auto handle_cleaner = tensorflow::gtl::MakeCleanup( + [var_handle]() { TFE_DeleteTensorHandle(var_handle); }); + + // Assign to the variable, copying to the custom device. + std::unique_ptr one( + TestScalarTensorHandle(111.f), TFE_DeleteTensorHandle); + op.reset(TFE_NewOp(context.get(), "AssignVariableOp", status.get())); + TFE_OpSetAttrType(op.get(), "dtype", TF_FLOAT); + TFE_OpAddInput(op.get(), var_handle, status.get()); + TFE_OpAddInput(op.get(), one.get(), status.get()); + TFE_OpSetDevice(op.get(), name, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + executed = false; + num_retvals = 0; + TFE_Execute(op.get(), nullptr, &num_retvals, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + ASSERT_TRUE(executed); + + // Read the variable's value. 
+ op.reset(TFE_NewOp(context.get(), "ReadVariableOp", status.get())); + TFE_OpAddInput(op.get(), var_handle, status.get()); + TFE_OpSetDevice(op.get(), name, status.get()); + TFE_OpSetAttrType(op.get(), "dtype", TF_FLOAT); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + executed = false; + num_retvals = 1; + TFE_TensorHandle* var_value = nullptr; + TFE_Execute(op.get(), &var_value, &num_retvals, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + ASSERT_TRUE(executed); + auto value_cleaner = tensorflow::gtl::MakeCleanup( + [var_value]() { TFE_DeleteTensorHandle(var_value); }); + ASSERT_EQ(tensorflow::string(name), + tensorflow::string( + TFE_TensorHandleBackingDeviceName(var_value, status.get()))); + TFE_TensorHandle* var_value_unpacked = + reinterpret_cast( + TFE_TensorHandleDevicePointer(var_value, status.get())) + ->tensor; + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + std::unique_ptr resolved_value( + TFE_TensorHandleResolve(var_value_unpacked, status.get()), + TF_DeleteTensor); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + ASSERT_EQ(111., *static_cast(TF_TensorData(resolved_value.get()))); + + // Free the backing buffer for the variable. + op.reset(TFE_NewOp(context.get(), "DestroyResourceOp", status.get())); + TFE_OpAddInput(op.get(), var_handle, status.get()); + TFE_OpSetDevice(op.get(), name, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + num_retvals = 0; + TFE_Execute(op.get(), nullptr, &num_retvals, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); +} + } // namespace From 2db072417194a7e674757af17a19fcf5d86b8f83 Mon Sep 17 00:00:00 2001 From: Youlong Cheng Date: Wed, 19 Feb 2020 17:52:05 -0800 Subject: [PATCH 316/442] Allow user to pass input_shape to split. PiperOrigin-RevId: 296099673 Change-Id: I2ea990d2e91a991fb1a89cf7ec5f1c749caaec5b --- .../experimental/xla_sharding/xla_sharding.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py b/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py index ded290a234d..b89bfd68073 100644 --- a/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py +++ b/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py @@ -90,7 +90,7 @@ class Sharding(object): tile_assignment_devices=list(flattened_devices))) @classmethod - def split(cls, tensor, split_dimension, num_devices): + def split(cls, tensor, split_dimension, num_devices, input_shape=None): """Returns a Sharding that splits a tensor across a dimension. This creates a Tiled attribute, similar to tile(), but easier to use for the @@ -100,12 +100,16 @@ class Sharding(object): tensor: A tf.Tensor to split. split_dimension: The dimension number to split. num_devices: The number of cores to split `tensor` over. + input_shape: The shape of the original tensor. Raises: ValueError: The tensor to split was smaller in the split dimension than the number of devices to split over. 
""" - shape = tensor.shape.as_list() + if input_shape: + shape = input_shape + else: + shape = tensor.shape.as_list() if (shape[split_dimension] is not None and shape[split_dimension] < num_devices): raise ValueError('Split dimension was smaller than the required number ' @@ -221,7 +225,8 @@ def split(tensor, split_dimension, num_devices, assign_tuple_sharding=False, - use_sharding_op=False): + use_sharding_op=False, + input_shape=None): """Returns a tensor that is split along the given dimension. Args: @@ -230,10 +235,11 @@ def split(tensor, num_devices: The number of devices to partition the dimension. assign_tuple_sharding: If the sharding type should be a tuple. use_sharding_op: If true, adds a sharding op to set the sharding. + input_shape: The full shape of the input tensor. """ if use_sharding_op: tensor = tf2xla.sharding(tensor) - Sharding.split(tensor, split_dimension, num_devices).apply_to_tensor( - tensor, - assign_tuple_sharding=assign_tuple_sharding) + Sharding.split( + tensor, split_dimension, num_devices, input_shape).apply_to_tensor( + tensor, assign_tuple_sharding=assign_tuple_sharding) return tensor From 31679b0d8440d2f119a2dc060b7d04fe77111bda Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 18:06:16 -0800 Subject: [PATCH 317/442] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 296102169 Change-Id: I07271901a4d49284d377b3d0da6fbb5cb1aeef27 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index ecdce1e627b..449a95765a5 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. 
-// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21345,7 +21345,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22053,7 +22053,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22249,7 +22249,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22318,7 +22318,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22433,7 +22433,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22492,7 +22492,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22666,7 +22666,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22857,7 +22857,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25297,7 +25297,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25629,7 +25629,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25679,7 +25679,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25929,7 +25929,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26559,7 +26559,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27624,7 +27624,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45536,7 +45536,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 1612c983e5b40103e6d9d65ebab92c18264dd399 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 18:13:53 -0800 Subject: [PATCH 318/442] Fix some recommendations in the Profiler Overview Page. 
PiperOrigin-RevId: 296103334 Change-Id: Ia9a0aff4044d77b8e8f91bc4a953325b47b28c9d --- .../profiler/convert/op_stats_to_overview_page.cc | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc b/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc index fa221e5524f..06fd60798dc 100644 --- a/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc +++ b/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc @@ -77,16 +77,21 @@ void ComputeDeviceTips(HardwareType hardware_type, const string& device_name = HardwareType_Name(hardware_type); string timeline_name = (hardware_type == tensorflow::profiler::TPU) ? "TPU core" : device_name; - *re->add_device_tips() = MakeOverviewPageTip(absl::StrCat( - "op_profile (identify the time-consuming operations executed on the ", - device_name, ")")); + string op_stats_toolname = (hardware_type == tensorflow::profiler::TPU) + ? "op_profile" + : "tensorflow_stats"; + *re->add_device_tips() = MakeOverviewPageTip( + absl::StrCat(op_stats_toolname, + " (identify the time-consuming operations " + "executed on the ", + device_name, ")")); *re->add_device_tips() = MakeOverviewPageTip(absl::StrCat( "trace_viewer (look at the activities on the timeline of each ", timeline_name, " in the trace view)")); } void ComputeFaqTips(OverviewPageRecommendation* re) { - *re->add_faq_tips() = MakeOverviewPageTip("Refer to the Cloud tools FAQ"); + *re->add_faq_tips() = MakeOverviewPageTip("Refer to the TF2 Profiler FAQ"); } void ComputeDocumentationTips(OverviewPageRecommendation* re) { From 9aac700d028c35efacc00afdb8ff6ded15535a9b Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Wed, 19 Feb 2020 18:20:38 -0800 Subject: [PATCH 319/442] Use GetTestUndeclaredOutputsDir to access TEST_UNDECLARED_OUTPUTS_DIR. On Windows, Bazel populates environment variables with `/`s only. Changing path manipulation logic to use `\` properly on Windows will conflict with this behavior, requiring a layer of indirection to deal with Bazel. PiperOrigin-RevId: 296104352 Change-Id: Ibaa19d0c4d231a15811232c63bcefc9d4931f88b --- tensorflow/compiler/xla/tests/literal_test_util.cc | 6 ++---- tensorflow/compiler/xla/tests/literal_test_util_test.cc | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc index 4dd59cdca5d..bb82193ae33 100644 --- a/tensorflow/compiler/xla/tests/literal_test_util.cc +++ b/tensorflow/compiler/xla/tests/literal_test_util.cc @@ -18,6 +18,7 @@ limitations under the License. #include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/literal_comparison.h" #include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/path.h" #include "tensorflow/core/platform/test.h" namespace xla { @@ -30,10 +31,7 @@ void WriteLiteralToTempFile(const LiteralSlice& literal, const string& name) { // TEST_UNDECLARED_OUTPUTS_DIR. This plays well with tools that inspect test // results, especially when they're run on remote machines. 
string outdir; - const char* undeclared_outputs_dir = getenv("TEST_UNDECLARED_OUTPUTS_DIR"); - if (undeclared_outputs_dir != nullptr) { - outdir = undeclared_outputs_dir; - } else { + if (!tensorflow::io::GetTestUndeclaredOutputsDir(&outdir)) { outdir = tensorflow::testing::TmpDir(); } diff --git a/tensorflow/compiler/xla/tests/literal_test_util_test.cc b/tensorflow/compiler/xla/tests/literal_test_util_test.cc index 66373af5686..e2ad5a7e08f 100644 --- a/tensorflow/compiler/xla/tests/literal_test_util_test.cc +++ b/tensorflow/compiler/xla/tests/literal_test_util_test.cc @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/path.h" #include "tensorflow/core/platform/test.h" namespace xla { @@ -129,10 +130,7 @@ TEST(LiteralTestUtilTest, ExpectNearFailurePlacesResultsInTemporaryDirectory) { tensorflow::Env* env = tensorflow::Env::Default(); string outdir; - const char* undeclared_outputs_dir = getenv("TEST_UNDECLARED_OUTPUTS_DIR"); - if (undeclared_outputs_dir != nullptr) { - outdir = undeclared_outputs_dir; - } else { + if (!tensorflow::io::GetTestUndeclaredOutputsDir(&outdir)) { outdir = tensorflow::testing::TmpDir(); } string pattern = tensorflow::io::JoinPath(outdir, "tempfile-*.pb"); From aadf705c858014a168d1a582accc81b7cc774d68 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 19 Feb 2020 18:28:23 -0800 Subject: [PATCH 320/442] [TF:MLIR] Add operation interface for folding operands transposes into the ops PiperOrigin-RevId: 296105420 Change-Id: Ie8c54de100910f6eda53bf2d02b194a0a8785ec8 --- .../mlir/tensorflow/ir/tf_generated_ops.td | 13 +++- .../mlir/tensorflow/ir/tf_op_interfaces.td | 42 ++++++++++++- .../compiler/mlir/tensorflow/ir/tf_ops.cc | 59 ++++++++++++++++++ .../mlir/tensorflow/ir/tf_verifiers.cc | 28 ++++++--- .../mlir/tensorflow/ir/tf_verifiers.h | 6 ++ ...yout_optimization_move_transposes_end.mlir | 25 ++++++++ .../transforms/layout_optimization.cc | 60 ++++++++++++++++--- 7 files changed, 211 insertions(+), 22 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 1d8dd178189..31e85ef247e 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -508,8 +508,8 @@ Broadcasting is supported, so `value` may have any number of dimensions. let extraClassDeclaration = [{ // TF_LayoutSensitiveInterface: - SmallVector GetLayoutDependentArgs() { return {0}; } - SmallVector GetLayoutDependentResults() { return {0}; } + SmallVector GetLayoutDependentArgs() { return {0}; } + SmallVector GetLayoutDependentResults() { return {0}; } }]; } @@ -3675,7 +3675,7 @@ retained with length 1. 
>]; } -def TF_MaxPoolOp : TF_Op<"MaxPool", [NoSideEffect]> { +def TF_MaxPoolOp : TF_Op<"MaxPool", [NoSideEffect, TF_FoldOperandsTransposeInterface]> { let summary = "Performs max pooling on the input."; let description = [{ @@ -3695,6 +3695,13 @@ def TF_MaxPoolOp : TF_Op<"MaxPool", [NoSideEffect]> { ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + + let extraClassDeclaration = [{ + // TF_FoldOperandsTransposeInterface: + SmallVector GetLayoutDependentArgs() { return {0}; } + SmallVector GetLayoutDependentResults() { return {0}; } + LogicalResult FoldOperandsPermutation(ArrayRef permutation); + }]; } def TF_MaxPoolGradOp : TF_Op<"MaxPoolGrad", [NoSideEffect]> { diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td index b887f966cbd..8700247af43 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td @@ -44,11 +44,11 @@ def TF_LayoutSensitiveInterface : OpInterface<"LayoutSensitiveInterface"> { >, InterfaceMethod< [{Returns indices of layout dependent arguments.}], - "SmallVector", "GetLayoutDependentArgs", (ins) + "SmallVector", "GetLayoutDependentArgs", (ins) >, InterfaceMethod< [{Returns indices of layout dependent results.}], - "SmallVector", "GetLayoutDependentResults", (ins) + "SmallVector", "GetLayoutDependentResults", (ins) >, ]; @@ -57,4 +57,42 @@ def TF_LayoutSensitiveInterface : OpInterface<"LayoutSensitiveInterface"> { }]; } +def TF_FoldOperandsTransposeInterface : OpInterface<"FoldOperandsTransposeInterface"> { + let description = [{ + Operation supports folding operand(s) transposes into the operation itself. + + (1) Operation might have layout dependent operands and results... + + Example: MaxPool(Transpose($arg, $perm)) + -> Transpose(MaxPool($arg, $perm)) + + (2) ... or it might have only layout dependent operands: + + Example: Mean(Transpose($arg, $reduction_dims)) + -> Mean($arg, Transpose($reduction_dims)) + }]; + + let methods = [ + InterfaceMethod< + [{Returns indices of layout dependent arguments.}], + "SmallVector", "GetLayoutDependentArgs", (ins) + >, + InterfaceMethod< + [{Returns indices of layout dependent results.}], + "SmallVector", "GetLayoutDependentResults", (ins) + >, + InterfaceMethod< + [{Updates operation attributes and operands to account for the folded + permutation. If folding of permutation is not possible, must return + failure.}], + "LogicalResult", "FoldOperandsPermutation", + (ins "ArrayRef":$permutation) + >, + ]; + + let verify = [{ + return VerifyFoldOperandsTransposeInterface($_op); + }]; +} + #endif // TF_OP_INTERFACES diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index c97f2ed5420..57e16d91d69 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -1350,6 +1350,65 @@ void MaxOp::build(Builder *builder, OperationState &result, Value input, build(builder, result, out_ty, input, reduction_indices, keep_dims); } +//===----------------------------------------------------------------------===// +// MaxPoolOp +//===----------------------------------------------------------------------===// + +LogicalResult MaxPoolOp::FoldOperandsPermutation( + ArrayRef permutation) { + MLIRContext *context = getParentOfType().getContext(); + + // For now we only support folding of NCHW->NHWC and NHWC->NCHW permutations. 
+ if (data_format() == "NHWC") { + static constexpr std::array kPerm = {0, 2, 3, 1}; // to NHWC + if (permutation != ArrayRef(kPerm)) return failure(); + + setAttr("data_format", StringAttr::get("NCHW", context)); + + } else if (data_format() == "NCHW") { + static constexpr std::array kPerm = {0, 3, 1, 2}; // to NCHW + if (permutation != ArrayRef(kPerm)) return failure(); + + setAttr("data_format", StringAttr::get("NHWC", context)); + + } else { + return failure(); + } + + auto shuffle_attr = [&](ArrayAttr attr) -> ArrayAttr { + SmallVector values{attr.begin(), attr.end()}; + SmallVector shuffled(values.size()); + + for (size_t i = 0; i < permutation.size(); ++i) + shuffled[permutation[i]] = values[i]; + + return ArrayAttr::get(shuffled, context); + }; + + setAttr("strides", shuffle_attr(strides())); + setAttr("ksize", shuffle_attr(ksize())); + + auto shuffle_type = [&](Type type) -> Type { + if (auto ranked_type = type.dyn_cast()) { + ArrayRef shape = ranked_type.getShape(); + assert(permutation.size() == shape.size()); + + SmallVector new_shape(permutation.size()); + for (size_t i = 0; i < permutation.size(); ++i) + new_shape[permutation[i]] = shape[i]; + + return RankedTensorType::get(new_shape, ranked_type.getElementType()); + } + + return type; + }; + + OpResult result = getOperation()->getResult(0); + result.setType(shuffle_type(result.getType())); + + return success(); +} + //===----------------------------------------------------------------------===// // MaxPoolGradOp //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.cc index 379797c99e4..247df44a90a 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.cc @@ -21,23 +21,35 @@ limitations under the License. 
namespace mlir { namespace TF { -LogicalResult VerifyLayoutSensitiveInterface(Operation* op) { - auto layout_sensitive_interface = cast(op); +namespace { - if (!llvm::all_of( - layout_sensitive_interface.GetLayoutDependentArgs(), - [&](int64_t index) { return index < op->getNumOperands(); })) { +template +LogicalResult VerifyLayoutDependentArgsAndResults(Operation* op, + Interface interface) { + auto valid_operand = [&](int64_t idx) { return idx < op->getNumOperands(); }; + if (!llvm::all_of(interface.GetLayoutDependentArgs(), valid_operand)) { return op->emitOpError("layout dependent argument index is out of bound"); } - if (!llvm::all_of( - layout_sensitive_interface.GetLayoutDependentResults(), - [&](int64_t index) { return index < op->getNumResults(); })) { + auto valid_result = [&](int64_t idx) { return idx < op->getNumResults(); }; + if (!llvm::all_of(interface.GetLayoutDependentResults(), valid_result)) { return op->emitOpError("layout dependent result index is out of bound"); } return success(); } +} // namespace + +LogicalResult VerifyLayoutSensitiveInterface(Operation* op) { + auto layout_sensitive_interface = cast(op); + return VerifyLayoutDependentArgsAndResults(op, layout_sensitive_interface); +} + +LogicalResult VerifyFoldOperandsTransposeInterface(Operation* op) { + auto fold_operands_transpose = cast(op); + return VerifyLayoutDependentArgsAndResults(op, fold_operands_transpose); +} + } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h index 776f0a9022a..5289328e73f 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h @@ -29,6 +29,12 @@ namespace TF { // [0, getNumOperands/getNumResults) range. LogicalResult VerifyLayoutSensitiveInterface(Operation* op); +// Verifies correctness of ops implementing FoldOperandsTransposeInterface (see +// definition in tf_op_base.td): +// (1) Layout dependent arguments and results indices must be in +// [0, getNumOperands/getNumResults) range. +LogicalResult VerifyFoldOperandsTransposeInterface(Operation* op); + } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir index 7c54bdb3889..10fc70683b3 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir @@ -47,3 +47,28 @@ func @move_across_multi_operand_op(%arg0: tensor<1x4x4x8xf32>, %arg1: tensor<1x4 return %3 : tensor<1x8x4x4xf32> } + +// CHECK-LABEL: func @fold_into_max_pool +func @fold_into_max_pool(%arg0: tensor<1x64x112x112xf32>) -> tensor<1x56x56x64xf32> { + + // MaxPool operand transpose must be folded into the op and MaxPool + // must use NCHW data format with updated kernel size and strides. 
+ + // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} + // CHECK: %[[MAX_POOL:[0-9]*]] = "tf.MaxPool"(%arg0) {data_format = "NCHW", ksize = [1, 1, 3, 3], padding = "SAME", strides = [1, 1, 2, 2]} : (tensor<1x64x112x112xf32>) -> tensor<1x64x56x56xf32> + // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[ADD]], %[[RES_PERM]]) + // CHECK: return %[[RES_TRANSPOSE]] + + // Transpose NCHW -> NHWC + %0 = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} : () -> tensor<4xi64> + %1 = "tf.Transpose"(%arg0, %0) : (tensor<1x64x112x112xf32>, tensor<4xi64>) -> tensor<1x112x112x64xf32> + + // Compute MaxPool in NHWC format + %2 = "tf.MaxPool"(%1) + { + data_format = "NHWC", ksize = [1, 3, 3, 1], + padding = "SAME", strides = [1, 2, 2, 1] + } : (tensor<1x112x112x64xf32>) -> tensor<1x56x56x64xf32> + + return %2 : tensor<1x56x56x64xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc b/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc index feef3516ade..d642b093e6b 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc @@ -261,8 +261,25 @@ void MoveTransposeBefore(Operation* op, SmallVector* work_list) { // Move Transpose operations that permute `op` operands after the `op`. void MoveTransposeAfter(Operation* op, SmallVector* work_list) { - // TODO(ezhulenev): Move transpose across layout sensitive operations. - if (!op->hasTrait()) return; + // Indices of operands and results that depend on data layout. + SmallVector layout_dependent_operands; + SmallVector layout_dependent_results; + + auto fold_operands = dyn_cast(op); + bool layout_agnostic = op->hasTrait(); + + if (fold_operands) { + layout_dependent_operands = fold_operands.GetLayoutDependentArgs(); + layout_dependent_results = fold_operands.GetLayoutDependentResults(); + + } else if (layout_agnostic) { + // For layout agnostic operation (e.g. element wise operations) all operands + // and results must have the same data layout. + for (unsigned i = 0; i < op->getNumOperands(); ++i) + layout_dependent_operands.push_back(i); + for (unsigned i = 0; i < op->getNumResults(); ++i) + layout_dependent_results.push_back(i); + } // Transpose operations that are operands of the `op`. SmallVector transpose_ops; @@ -270,9 +287,11 @@ void MoveTransposeAfter(Operation* op, SmallVector* work_list) { // Constant operation that defines permutation indices for operand transposes. ConstOp permutation_op; - // All operation operands must be transpose operations with the same + // Layout dependent operands must be transpose operations with the same // permutation indices. - for (OpOperand& operand : op->getOpOperands()) { + for (unsigned idx : layout_dependent_operands) { + OpOperand& operand = op->getOpOperand(idx); + // Operand must be defined by a transpose op. TransposeOp transpose = dyn_cast_or_null(operand.get().getDefiningOp()); @@ -299,6 +318,22 @@ void MoveTransposeAfter(Operation* op, SmallVector* work_list) { // Nothing to do here. if (!permutation_op) return; + // All results after transpose must preserve the original result type. + SmallVector original_type(op->getNumResults()); + for (unsigned idx : layout_dependent_results) + original_type[idx] = op->getResult(idx).getType(); + + // Check if we can fold transpose into the operation. 
+ if (fold_operands) { + SmallVector permutation; + + auto attr = permutation_op.value().cast(); + for (auto value : attr.getIntValues()) + permutation.push_back(value.getSExtValue()); + + if (failed(fold_operands.FoldOperandsPermutation(permutation))) return; + } + // At this point we checked that we can safely move Transpose node after // `op`, bypass all operands transposes, and transpose op results. Location loc = op->getLoc(); @@ -306,19 +341,25 @@ void MoveTransposeAfter(Operation* op, SmallVector* work_list) { // Move constant op defining result permutation to the beginning of the block. permutation_op.getOperation()->moveBefore(&op->getBlock()->front()); - // Bypass Transpose nodes for all operands. - for (OpOperand& operand : op->getOpOperands()) { + // Bypass Transpose nodes for layout dependent operands. + for (unsigned idx : layout_dependent_operands) { + OpOperand& operand = op->getOpOperand(idx); TransposeOp transpose = dyn_cast(operand.get().getDefiningOp()); operand.set(transpose.getOperand(0)); } - // Maybe add Transpose nodes for all results (or reuse existing transposes). + // Maybe add Transpose nodes for layout dependent results + // (or reuse existing transposes). OpBuilder builder(op); builder.setInsertionPoint(op); - for (OpResult result : op->getResults()) { - result.setType(op->getOperand(0).getType()); + for (unsigned idx : layout_dependent_results) { + OpResult result = op->getResult(idx); + + // Forward operand type only for layout agnostic operations, operations with + // custom folding will update the result type in `FoldOperandsPermutation`. + if (layout_agnostic) result.setType(op->getOperand(0).getType()); // Try to push transpose further down. for (Operation* user : result.getUsers()) work_list->push_back(user); @@ -330,6 +371,7 @@ void MoveTransposeAfter(Operation* op, SmallVector* work_list) { transpose.getOperation()->moveBefore(op->getNextNode()); transpose.setOperand(0, result); transpose.setOperand(1, permutation_op); + transpose.getResult().setType(original_type[idx]); } else { transpose = builder.create(loc, result, permutation_op); } From 87c225ef0e8b1eac47dac471c8b6307ebd1f79be Mon Sep 17 00:00:00 2001 From: Prakalp Srivastava Date: Wed, 19 Feb 2020 18:58:08 -0800 Subject: [PATCH 321/442] Add verifier for HLO Iota op. Also fixes a bug in tf.RandomShuffle legalization caught by verifier. 
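For illustration only (these mirror the negative test cases added to ops.mlir in this
change, and add no functionality beyond it), the verifier now rejects IR such as:

    // Scalar iota results are rejected ("does not support scalars").
    %0 = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<i32>

    // An iota_dimension outside [0, rank) is rejected; the result below has rank 1,
    // so only iota_dimension = 0 would be valid.
    %1 = "xla_hlo.iota"() {iota_dimension = 1 : i64} : () -> tensor<4xi32>
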
PiperOrigin-RevId: 296109247 Change-Id: Icea818f51a6eab91f65efb65aa07f9639d9704a6 --- tensorflow/compiler/mlir/xla/ir/hlo_ops.cc | 14 ++++++++++++++ tensorflow/compiler/mlir/xla/ir/hlo_ops.td | 2 +- .../compiler/mlir/xla/tests/legalize-tf.mlir | 2 +- tensorflow/compiler/mlir/xla/tests/ops.mlir | 16 ++++++++++++++++ .../compiler/mlir/xla/transforms/legalize_tf.cc | 2 +- 5 files changed, 33 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc index 23c25e7d0cd..481c12b42c2 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc @@ -202,6 +202,20 @@ OpFoldResult IotaOp::fold(ArrayRef operands) { return DenseIntElementsAttr::get(output_type, values); } +static LogicalResult Verify(IotaOp op) { + auto shape = op.getType().cast(); + if (!shape.hasRank()) return success(); + + if (shape.getRank() == 0) + return op.emitOpError() << "does not support scalars."; + + auto iota_dimension = op.iota_dimension().getSExtValue(); + if (iota_dimension >= shape.getRank() || iota_dimension < 0) + return op.emitOpError() << "iota dimension cannot go beyond the output " + "rank or be negative."; + return success(); +} + //===----------------------------------------------------------------------===// // AbsOp //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td index e2cd42104b3..e9727798907 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td @@ -120,7 +120,7 @@ def HLO_ConstOp : HLO_Op<"constant", [NoSideEffect]>, BASE_HLO_ConstOp { def HLO_IotaOp : HLO_Op<"iota", [NoSideEffect]>, BASE_HLO_IotaOp { let arguments = (ins I64Attr:$iota_dimension); - let results = (outs HLO_Tensor:$output); + let results = (outs HLO_IntFpOrComplexTensor:$output); let hasFolder = 1; diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index 67f085ef9a0..d80722e2865 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -3308,7 +3308,7 @@ func @random_shuffle_1D_10240(%input: tensor<10240xf32>) -> tensor<10240xf32> { // CHECK-LABEL: @random_shuffle_3D // CHECK-SAME: [[INPUT:%.*]]: tensor<4x?x16xf32> func @random_shuffle_3D(%input: tensor<4x?x16xf32>) -> tensor<4x?x16xf32> { - // CHECK: [[INDICES:%.*]] = "xla_hlo.iota"() {iota_dimension = 4 : i64} : () -> tensor<4xi32> + // CHECK: [[INDICES:%.*]] = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<4xi32> // CHECK: [[RNG_SHAPE:%.*]] = xla_hlo.constant dense<4> : tensor<1xi64> // CHECK: [[RNG_LOWER:%.*]] = xla_hlo.constant dense<0> : tensor diff --git a/tensorflow/compiler/mlir/xla/tests/ops.mlir b/tensorflow/compiler/mlir/xla/tests/ops.mlir index 3c91f1d7dd0..7e2845daa06 100644 --- a/tensorflow/compiler/mlir/xla/tests/ops.mlir +++ b/tensorflow/compiler/mlir/xla/tests/ops.mlir @@ -292,6 +292,22 @@ func @infeed_non_token_second_result(%token: !xla_hlo.token) -> tuple tensor { + // expected-error@+1 {{does not support scalars}} + %0 = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor + return %0 : tensor +} + +// ----- + +func @iota_invalid_iota_dimension() -> tensor<4xi32> { + // expected-error@+1 {{iota dimension cannot go beyond the output rank or be negative}} + %0 = "xla_hlo.iota"() {iota_dimension = 1 : i64} : () -> 
tensor<4xi32> + return %0 : tensor<4xi32> +} + +// ----- + func @map_mismatched_args(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { // expected-error@+1 {{expects number of operands to match the arity of map computation, but got: 2 and 1}} %0 = "xla_hlo.map"(%arg0, %arg1) ( { diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc index 50ecce24df3..da135ea1860 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc @@ -3362,7 +3362,7 @@ class ConvertRandomShuffleOp : public OpRewritePattern { auto indices_type = RankedTensorType::get({first_dim_size}, rewriter.getIntegerType(32)); Value indices = rewriter.create( - op.getLoc(), indices_type, rewriter.getI64IntegerAttr(first_dim_size)); + op.getLoc(), indices_type, rewriter.getI64IntegerAttr(0)); // Generate random numbers to be used as swaps for the indices. Value swaps = CreateRngUniform32(op.getLoc(), first_dim_size, 0, From c2a17671b3999a224c147f1a238275d0d6a8cb56 Mon Sep 17 00:00:00 2001 From: Advait Jain Date: Wed, 19 Feb 2020 19:19:46 -0800 Subject: [PATCH 322/442] Remove unnecessary TF_LITE_MICRO_TENSORS_PREPARED. PiperOrigin-RevId: 296111850 Change-Id: Ie817bad07ad60b12ecd05aaa82de7f03b476972c --- tensorflow/lite/micro/tools/make/targets/xtensa_xpg_makefile.inc | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/lite/micro/tools/make/targets/xtensa_xpg_makefile.inc b/tensorflow/lite/micro/tools/make/targets/xtensa_xpg_makefile.inc index d9545fc2116..0ccad72692d 100644 --- a/tensorflow/lite/micro/tools/make/targets/xtensa_xpg_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/xtensa_xpg_makefile.inc @@ -7,7 +7,6 @@ ifeq ($(TARGET), xtensa-xpg) TARGET_ARCH := xtensa-xpg PLATFORM_ARGS = \ - -DTF_LITE_MICRO_TENSORS_PREPARED \ -DTF_LITE_STATIC_MEMORY \ -DTF_LITE_STRIP_ERROR_STRINGS \ -DNDEBUG \ From ed4ca062fd0333cd55f109a4767cd101a3131f7a Mon Sep 17 00:00:00 2001 From: Pallavi G Date: Wed, 19 Feb 2020 13:29:40 +0800 Subject: [PATCH 323/442] Address the coding style issues due to clang-format version mismatch --- tensorflow/core/kernels/mkl_concat_op.cc | 15 ++++++++------- tensorflow/core/util/mkl_types.h | 2 ++ tensorflow/core/util/mkl_util.h | 16 +++++----------- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc index d0e5ba69560..3f2e2c17b54 100644 --- a/tensorflow/core/kernels/mkl_concat_op.cc +++ b/tensorflow/core/kernels/mkl_concat_op.cc @@ -184,12 +184,13 @@ class EigenConcatBaseOp : public OpKernel { const auto in = values[i]; const bool in_is_scalar = TensorShapeUtils::IsScalar(input_shapes[i]); OP_REQUIRES( - c, (input_shapes[i].dims() == input_dims) || - (input_is_scalar && in_is_scalar), + c, + (input_shapes[i].dims() == input_dims) || + (input_is_scalar && in_is_scalar), errors::InvalidArgument( "ConcatOp : Ranks of all input tensors should match: shape[0] = ", - input_shape.DebugString(), " vs. shape[", i, "] = ", - input_shapes[i].DebugString())); + input_shape.DebugString(), " vs. 
shape[", i, + "] = ", input_shapes[i].DebugString())); if (in.NumElements() > 0) { int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0; inputs_flat.emplace_back(new typename TTypes::ConstMatrix( @@ -861,9 +862,9 @@ class MklConcatOp : public OpKernel { DCHECK(dst_tensor != nullptr) << "Output tensor pointer is NULL"; } } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + ", message: " + - string(e.message) + ", in file " + string(__FILE__) + - ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); OP_REQUIRES_OK( context, errors::Aborted("Operation received an exception:", error_msg)); diff --git a/tensorflow/core/util/mkl_types.h b/tensorflow/core/util/mkl_types.h index eede9b6087f..685e19d8d6c 100644 --- a/tensorflow/core/util/mkl_types.h +++ b/tensorflow/core/util/mkl_types.h @@ -110,6 +110,7 @@ namespace tensorflow { #define TENSOR_FORMAT MKL_TENSOR_FORMAT #define TENSOR_FORMAT_NHWC MKL_TENSOR_FORMAT_NHWC #define TENSOR_MAX_DIMS MKLDNN_MAX_NDIMS +#define GET_USR_MEM_PRIM_DESC(src) src.GetUsrMemDesc() #else @@ -205,6 +206,7 @@ namespace tensorflow { #define SUMMAND_MD summand_pd #define TENSOR_FORMAT TensorFormat #define TENSOR_FORMAT_NHWC FORMAT_NHWC +#define GET_USR_MEM_PRIM_DESC(src) src.GetUsrMemPrimDesc() #endif // ENABLE_MKLDNN_V1 } // namespace tensorflow diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index 5e5416ee645..a782e76547b 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -732,9 +732,9 @@ inline Status ConvertMklToTF(OpKernelContext* context, } return Status::OK(); } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + ", message: " + - string(e.message) + ", in file " + string(__FILE__) + - ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + + ", message: " + string(e.message) + ", in file " + + string(__FILE__) + ":" + std::to_string(__LINE__); LOG(FATAL) << "Operation received an exception: " << error_msg; } } @@ -1254,8 +1254,8 @@ inline Status CreateBlockedMemDescHelper(const memory::dims& dim, } catch (mkldnn::error& e) { return Status(error::Code::INTERNAL, tensorflow::strings::StrCat( - "Failed to create blocked memory descriptor.", "Status: ", - e.status, ", message: ", e.message)); + "Failed to create blocked memory descriptor.", + "Status: ", e.status, ", message: ", e.message)); } #else // We have to construct memory descriptor in a C style. This is not at all @@ -2162,12 +2162,6 @@ void execute_primitives( } #endif // ENABLE_MKLDNN_V1 -#ifdef ENABLE_MKLDNN_V1 -#define GET_USR_MEM_PRIM_DESC(src) src.GetUsrMemDesc() -#else -#define GET_USR_MEM_PRIM_DESC(src) src.GetUsrMemPrimDesc() -#endif // ENABLE_MKLDNN_V1 - } // namespace tensorflow #endif // INTEL_MKL #endif // TENSORFLOW_CORE_UTIL_MKL_UTIL_H_ From bfb33d2e828cde2c8aef9c62912d9bec2c830517 Mon Sep 17 00:00:00 2001 From: Revan Sopher Date: Wed, 19 Feb 2020 19:29:57 -0800 Subject: [PATCH 324/442] Fix TPU nightly build script. We can't pass a single string containing all the arguments, since it'll be taken as a single argument instead. Storing as an array allows us to safely expand. 
PiperOrigin-RevId: 296112909 Change-Id: Ic22b58bd8e6f9bbeadcbec5e0a78c2ac2e122f9c --- .../release/ubuntu_16/tpu_py37_full/nonpip.sh | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/tensorflow/tools/ci_build/release/ubuntu_16/tpu_py37_full/nonpip.sh b/tensorflow/tools/ci_build/release/ubuntu_16/tpu_py37_full/nonpip.sh index 40626ae21a6..9d5488a7236 100644 --- a/tensorflow/tools/ci_build/release/ubuntu_16/tpu_py37_full/nonpip.sh +++ b/tensorflow/tools/ci_build/release/ubuntu_16/tpu_py37_full/nonpip.sh @@ -35,22 +35,24 @@ export TF2_BEHAVIOR=1 yes "" | "$PYTHON_BIN_PATH" configure.py -tag_filters="tpu,requires-tpu,-no_tpu,-notpu,-no_oss,-no_oss_py37" +tag_filters="tpu,-no_tpu,-notpu,-no_oss,-no_oss_py37" -bazel_args="--config=opt \ +bazel_args=( + --config=opt \ --crosstool_top=//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.1:toolchain \ --linkopt=-lrt \ - --action_env=TF2_BEHAVIOR=${TF2_BEHAVIOR} \ + --action_env=TF2_BEHAVIOR="${TF2_BEHAVIOR}" \ --noincompatible_strict_action_env \ - --build_tag_filters=${tag_filters} \ - --test_tag_filters=${tag_filters} \ + --build_tag_filters="${tag_filters}" \ + --test_tag_filters="${tag_filters}" \ --test_output=errors --verbose_failures=true --keep_going \ - --test_arg=--tpu=${TPU_NAME} \ - --test_arg=--zone=${TPU_ZONE} \ + --test_arg=--tpu="${TPU_NAME}" \ + --test_arg=--zone="${TPU_ZONE}" \ --test_arg=--test_dir_base=gs://kokoro-tpu-testing/tempdir/ \ --local_test_jobs=1 \ - -- //tensorflow/... -//tensorflow/compiler/... -//tensorflow/lite/..." + -- //tensorflow/... -//tensorflow/compiler/... -//tensorflow/lite/... +) -bazel build "${bazel_args}" +bazel build "${bazel_args[@]}" ctpu_up -s v2-8 -p tensorflow-testing-tpu -bazel test "${bazel_args}" +bazel test "${bazel_args[@]}" From 6bec2792a771af0dea61828037332160d15595a6 Mon Sep 17 00:00:00 2001 From: Tiezhen WANG Date: Wed, 19 Feb 2020 19:43:59 -0800 Subject: [PATCH 325/442] TFL: slightly speed up reference::Softmax by avoiding unnecessary float->double cast. The original logic is a bit weird that the calculation is in double while the accumulator is in float. Also in general, beta doesn't have a huge significant figures. PiperOrigin-RevId: 296114624 Change-Id: I4e43fb9606b7b3c9f352de46da5d36cc50d7897a --- tensorflow/lite/kernels/internal/reference/softmax.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tensorflow/lite/kernels/internal/reference/softmax.h b/tensorflow/lite/kernels/internal/reference/softmax.h index 790f4d28ddb..ac06d49000e 100644 --- a/tensorflow/lite/kernels/internal/reference/softmax.h +++ b/tensorflow/lite/kernels/internal/reference/softmax.h @@ -43,20 +43,18 @@ inline void Softmax(const SoftmaxParams& params, max = std::max(max, input_data[i * depth + c]); } - // TODO(b/148114827): Improve this code. // Compute sum. float sum = 0.f; for (int c = 0; c < depth; ++c) { - sum += std::exp(static_cast(input_data[i * depth + c] - max) * - params.beta); + sum += std::exp((input_data[i * depth + c] - max) * + static_cast(params.beta)); } // Compute result. 
for (int c = 0; c < depth; ++c) { - output_data[i * depth + c] = - std::exp(static_cast(input_data[i * depth + c] - max) * - params.beta) / - static_cast(sum); + output_data[i * depth + c] = std::exp((input_data[i * depth + c] - max) * + static_cast(params.beta)) / + sum; } } } From d317cb0b59929a9e0ce3f80423b80eb02d27f241 Mon Sep 17 00:00:00 2001 From: Ran Chen Date: Wed, 19 Feb 2020 20:20:19 -0800 Subject: [PATCH 326/442] Add aggregation to OptimizerV2.apply_gradients This option allows post processing of all reduced gradients, without inheriting from optimizer. PiperOrigin-RevId: 296118658 Change-Id: Ifb6884ec981b06eb70fe5ee9126ab9ac013550e9 --- tensorflow/python/distribute/BUILD | 31 +++++- tensorflow/python/distribute/combinations.py | 8 ++ .../custom_training_loop_optimizer_test.py | 101 ++++++++++++++++++ .../python/keras/optimizer_v2/optimizer_v2.py | 50 +++++++-- .../keras/optimizer_v2/optimizer_v2_test.py | 30 ++++++ ...ensorflow.keras.optimizers.-adadelta.pbtxt | 2 +- ...tensorflow.keras.optimizers.-adagrad.pbtxt | 2 +- .../tensorflow.keras.optimizers.-adam.pbtxt | 2 +- .../tensorflow.keras.optimizers.-adamax.pbtxt | 2 +- .../tensorflow.keras.optimizers.-ftrl.pbtxt | 2 +- .../tensorflow.keras.optimizers.-nadam.pbtxt | 2 +- ...nsorflow.keras.optimizers.-optimizer.pbtxt | 2 +- ...nsorflow.keras.optimizers.-r-m-sprop.pbtxt | 2 +- .../tensorflow.keras.optimizers.-s-g-d.pbtxt | 2 +- ...ensorflow.keras.optimizers.-adadelta.pbtxt | 2 +- ...tensorflow.keras.optimizers.-adagrad.pbtxt | 2 +- .../tensorflow.keras.optimizers.-adam.pbtxt | 2 +- .../tensorflow.keras.optimizers.-adamax.pbtxt | 2 +- .../tensorflow.keras.optimizers.-ftrl.pbtxt | 2 +- .../tensorflow.keras.optimizers.-nadam.pbtxt | 2 +- ...nsorflow.keras.optimizers.-optimizer.pbtxt | 2 +- ...nsorflow.keras.optimizers.-r-m-sprop.pbtxt | 2 +- .../tensorflow.keras.optimizers.-s-g-d.pbtxt | 2 +- .../v2/tensorflow.optimizers.-adadelta.pbtxt | 2 +- .../v2/tensorflow.optimizers.-adagrad.pbtxt | 2 +- .../v2/tensorflow.optimizers.-adam.pbtxt | 2 +- .../v2/tensorflow.optimizers.-adamax.pbtxt | 2 +- .../v2/tensorflow.optimizers.-ftrl.pbtxt | 2 +- .../v2/tensorflow.optimizers.-nadam.pbtxt | 2 +- .../v2/tensorflow.optimizers.-optimizer.pbtxt | 2 +- .../v2/tensorflow.optimizers.-r-m-sprop.pbtxt | 2 +- .../v2/tensorflow.optimizers.-s-g-d.pbtxt | 2 +- 32 files changed, 237 insertions(+), 37 deletions(-) create mode 100644 tensorflow/python/distribute/custom_training_loop_optimizer_test.py diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 1ccb21cea17..461365b4b45 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -935,11 +935,9 @@ distribute_py_test( deps = [ "//tensorflow/python:errors", "//tensorflow/python:variables", - "//tensorflow/python/data/ops:dataset_ops", "//tensorflow/python/distribute:combinations", "//tensorflow/python/distribute:strategy_combinations", "//tensorflow/python/eager:test", - "//tensorflow/python/keras", "@absl_py//absl/testing:parameterized", ], ) @@ -990,11 +988,36 @@ distribute_py_test( "multi_and_single_gpu", ], deps = [ + ":combinations", + ":strategy_combinations", "//tensorflow/python:errors", "//tensorflow/python:variables", "//tensorflow/python/data/ops:dataset_ops", - "//tensorflow/python/distribute:combinations", - "//tensorflow/python/distribute:strategy_combinations", + "//tensorflow/python/eager:test", + "//tensorflow/python/keras", + "@absl_py//absl/testing:parameterized", + ], +) + +distribute_py_test( + name = 
"custom_training_loop_optimizer_test", + srcs = ["custom_training_loop_optimizer_test.py"], + main = "custom_training_loop_optimizer_test.py", + tags = [ + "multi_and_single_gpu", + ], + deps = [ + ":combinations", + ":distribute_lib", + ":reduce_util", + ":strategy_combinations", + ":values", + "//tensorflow/python:clip_ops", + "//tensorflow/python:framework_ops", + "//tensorflow/python:math_ops", + "//tensorflow/python:util", + "//tensorflow/python/eager:backprop", + "//tensorflow/python/eager:def_function", "//tensorflow/python/eager:test", "//tensorflow/python/keras", "@absl_py//absl/testing:parameterized", diff --git a/tensorflow/python/distribute/combinations.py b/tensorflow/python/distribute/combinations.py index 80a185d1af5..5f6779911c4 100644 --- a/tensorflow/python/distribute/combinations.py +++ b/tensorflow/python/distribute/combinations.py @@ -204,6 +204,14 @@ class NamedDistribution(object): return self._name +def concat(*combined): + """Concats combinations.""" + result = [] + for one in combined: + result += one + return result + + _defaults = framework_combinations.generate.keywords["test_combinations"] generate = functools.partial( diff --git a/tensorflow/python/distribute/custom_training_loop_optimizer_test.py b/tensorflow/python/distribute/custom_training_loop_optimizer_test.py new file mode 100644 index 00000000000..451e936d9b5 --- /dev/null +++ b/tensorflow/python/distribute/custom_training_loop_optimizer_test.py @@ -0,0 +1,101 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for custom training loops that involves advanced optimizer usage.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl.testing import parameterized + +from tensorflow.python import keras +from tensorflow.python.distribute import combinations +from tensorflow.python.distribute import strategy_combinations +from tensorflow.python.distribute import values +from tensorflow.python.eager import def_function +from tensorflow.python.eager import test +from tensorflow.python.framework import ops +from tensorflow.python.ops import variables + + +class OptimizerTest(test.TestCase, parameterized.TestCase): + + @combinations.generate( + combinations.times( + combinations.combine( + distribution=strategy_combinations.multidevice_strategies, + mode=["eager"], + ), + combinations.concat( + combinations.combine( + all_reduce_sum_gradients=True, + expected=[[[-0.3, -0.3], [-0.3, -0.3]]]), + combinations.combine( + all_reduce_sum_gradients=False, + expected=[[[-0.1, -0.1], [-0.2, -0.2]]]), + ))) + def test_custom_aggregation(self, distribution, all_reduce_sum_gradients, + expected): + + with distribution.scope(): + v = variables.Variable([0., 0.]) + optimizer = keras.optimizer_v2.gradient_descent.SGD(0.1) + + @def_function.function + def optimize(): + grads = values.PerReplica([ + ops.convert_to_tensor([1., 1.]), + ops.convert_to_tensor([2., 2.]), + ]) + + def step_fn(grads): + optimizer.apply_gradients( + [(grads, v)], all_reduce_sum_gradients=all_reduce_sum_gradients) + return v.read_value() + + return distribution.experimental_local_results( + distribution.experimental_run_v2(step_fn, args=(grads,))) + + self.assertAllClose(optimize(), expected) + + @combinations.generate( + combinations.combine( + distribution=strategy_combinations.one_device_strategy, + mode=["eager"], + all_reduce_sum_gradients=[True, False])) + def test_custom_aggregation_one_device(self, distribution, + all_reduce_sum_gradients): + + with distribution.scope(): + v = variables.Variable([0., 0.]) + optimizer = keras.optimizer_v2.gradient_descent.SGD(0.1) + + @def_function.function + def optimize(): + grads = ops.convert_to_tensor([1., 1.]) + + def step_fn(grads): + optimizer.apply_gradients( + [(grads, v)], all_reduce_sum_gradients=all_reduce_sum_gradients) + return v.read_value() + + return distribution.experimental_local_results( + distribution.experimental_run_v2(step_fn, args=(grads,))) + + self.assertAllClose(optimize(), [[-0.1, -0.1]]) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py index ab088c24de1..6b73963530f 100644 --- a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py +++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py @@ -27,6 +27,7 @@ import six from tensorflow.python.distribute import distribution_strategy_context as distribute_ctx from tensorflow.python.distribute import reduce_util as ds_reduce_util +from tensorflow.python.distribute import values as ds_values from tensorflow.python.eager import backprop from tensorflow.python.eager import context from tensorflow.python.framework import dtypes @@ -158,6 +159,10 @@ class OptimizerV2(trackable.Trackable): `tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE` for averaging or `tf.keras.losses.Reduction.SUM` for not. 
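The new test above exercises the pattern this change enables: compute gradients per replica, reduce them yourself, post-process the reduced values, and only then hand them to the optimizer. As a rough end-to-end sketch (editorial, not part of the patch; `strategy`, `model` and the training data are placeholders, `experimental_run_v2` is the distribution entry point of this TF version, and the `all_reduce_sum_gradients` keyword exists only once this change is applied), clipping the already-aggregated gradients would look like:

```python
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
  model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
  optimizer = tf.keras.optimizers.SGD(0.1)

@tf.function
def train_step(inputs, labels):
  def step_fn(inputs, labels):
    with tf.GradientTape() as tape:
      loss = tf.reduce_mean(tf.square(model(inputs) - labels))
    grads = tape.gradient(loss, model.trainable_variables)
    # Sum the gradients across replicas ourselves, then post-process the
    # already-aggregated values (here: clip by norm) before applying them.
    grads = tf.distribute.get_replica_context().all_reduce('sum', grads)
    grads = [tf.clip_by_norm(g, 1.0) for g in grads]
    optimizer.apply_gradients(
        zip(grads, model.trainable_variables), all_reduce_sum_gradients=False)

  strategy.experimental_run_v2(step_fn, args=(inputs, labels))
```

With the default `all_reduce_sum_gradients=True`, `apply_gradients` performs the same summation internally, so existing training loops keep their current behavior.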
+ To aggregate gradients yourself, call `apply_gradients` with + `all_reduce_sum_gradients` set to False. This is useful if you need to process + aggregated gradients. + If you are not using these and you want to average gradients, you should use `tf.math.reduce_sum` to add up your per-example losses and then divide by the global batch size. Note that when using `tf.distribute.Strategy`, the first @@ -415,16 +420,36 @@ class OptimizerV2(trackable.Trackable): grads = self._clip_gradients(grads) return grads - def apply_gradients(self, grads_and_vars, name=None): + def apply_gradients(self, + grads_and_vars, + name=None, + all_reduce_sum_gradients=True): """Apply gradients to variables. This is the second part of `minimize()`. It returns an `Operation` that applies gradients. + The method sums gradients from all replicas in the presence of + `tf.distribute.Strategy` by default. You can aggregate gradients yourself by + passing `all_reduce_sum_gradients=False`. + + Example: + + ```python + grads = tape.gradient(loss, vars) + grads = tf.distribute.get_replica_context().all_reduce('sum', grads) + # Processing aggregated gradients. + optimizer.apply_gradients(zip(grads, vars), all_reduce_sum_gradients=False) + + ``` + Args: grads_and_vars: List of (gradient, variable) pairs. name: Optional name for the returned operation. Default to the name passed to the `Optimizer` constructor. + all_reduce_sum_gradients: Whether to sum gradients from different + replicas in the presense of `tf.distribute.Strategy`. If False, it's + user responsibility to aggregate the gradients. Default to True. Returns: An `Operation` that applies the specified gradients. The `iterations` @@ -452,18 +477,23 @@ class OptimizerV2(trackable.Trackable): return distribute_ctx.get_replica_context().merge_call( functools.partial(self._distributed_apply, apply_state=apply_state), args=(grads_and_vars,), - kwargs={"name": name}) + kwargs={ + "name": name, + "all_reduce_sum_gradients": all_reduce_sum_gradients, + }) def _aggregate_gradients(self, distribution, grads_and_vars): """Returns all-reduced gradients.""" return distribution.extended.batch_reduce_to( ds_reduce_util.ReduceOp.SUM, grads_and_vars) - def _distributed_apply(self, distribution, grads_and_vars, name, apply_state): + def _distributed_apply(self, distribution, grads_and_vars, name, apply_state, + all_reduce_sum_gradients): """`apply_gradients` using a `DistributionStrategy`.""" - reduced_grads = self._aggregate_gradients(distribution, grads_and_vars) - var_list = [v for _, v in grads_and_vars] - grads_and_vars = zip(reduced_grads, var_list) + if all_reduce_sum_gradients: + reduced_grads = self._aggregate_gradients(distribution, grads_and_vars) + var_list = [v for _, v in grads_and_vars] + grads_and_vars = zip(reduced_grads, var_list) def apply_grad_to_update_var(var, grad): """Apply gradient to variable.""" @@ -493,6 +523,14 @@ class OptimizerV2(trackable.Trackable): update_ops = [] with ops.name_scope(name or self._name, skip_on_eager=True): for grad, var in grads_and_vars: + # TODO(crccw): It's not allowed to assign PerReplica value to + # MirroredVariable. Remove this after we relax this restriction. + def _assume_mirrored(grad): + if isinstance(grad, ds_values.PerReplica): + return ds_values.Mirrored(grad.values) + return grad + + grad = nest.map_structure(_assume_mirrored, grad) # Colocate the update with variables to avoid unnecessary communication # delays. See b/136304694. 
with distribution.extended.colocate_vars_with(var): diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py index 2b74c3fa12f..f8985de0c66 100644 --- a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py +++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py @@ -621,6 +621,36 @@ class OptimizerTest(test.TestCase): opt.minimize(lambda: constant_op.constant(1.), []) opt.apply_gradients([]) + @test_util.run_in_graph_and_eager_modes + def testAggregationTrue(self): + # Test that all_reduce_sum_gradients=True works without distributed + # strategy. + var = resource_variable_ops.ResourceVariable([1., 2.]) + opt = gradient_descent.SGD(3.0) + + self.evaluate(variables.global_variables_initializer()) + self.assertAllClose([1., 2.], self.evaluate(var)) + opt_op = opt.apply_gradients([([0.1, 0.1], var)], + all_reduce_sum_gradients=True) + self.evaluate(variables.global_variables_initializer()) + self.evaluate(opt_op) + self.assertAllClose([0.7, 1.7], self.evaluate(var)) + + @test_util.run_in_graph_and_eager_modes + def testAggregationFalse(self): + # Test that all_reduce_sum_gradients=False works without distributed + # strategy. + var = resource_variable_ops.ResourceVariable([1., 2.]) + opt = gradient_descent.SGD(3.0) + + self.evaluate(variables.global_variables_initializer()) + self.assertAllClose([1., 2.], self.evaluate(var)) + opt_op = opt.apply_gradients([([0.1, 0.1], var)], + all_reduce_sum_gradients=False) + self.evaluate(variables.global_variables_initializer()) + self.evaluate(opt_op) + self.assertAllClose([0.7, 1.7], self.evaluate(var)) + @keras_parameterized.run_all_keras_modes class OptimizersCompatibilityTest(keras_parameterized.TestCase): diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adadelta.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adadelta.pbtxt index 84718036246..aaf0e8cc131 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adadelta.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adadelta.pbtxt @@ -26,7 +26,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adagrad.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adagrad.pbtxt index 0466ea65fa3..2abbf63ada3 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adagrad.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adagrad.pbtxt @@ -26,7 +26,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adam.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adam.pbtxt index 9762fad5d0f..c7c04aa59cf 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adam.pbtxt +++ 
b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adam.pbtxt @@ -26,7 +26,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adamax.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adamax.pbtxt index f477a60d237..a507e04483b 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adamax.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adamax.pbtxt @@ -26,7 +26,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-ftrl.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-ftrl.pbtxt index 9b736df5819..53b091a553b 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-ftrl.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-ftrl.pbtxt @@ -26,7 +26,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-nadam.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-nadam.pbtxt index 3ffb4bb8b4d..80a8e3a90db 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-nadam.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-nadam.pbtxt @@ -26,7 +26,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-optimizer.pbtxt index 9639c71ce41..e95145b1fc5 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-optimizer.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-optimizer.pbtxt @@ -25,7 +25,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-r-m-sprop.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-r-m-sprop.pbtxt index 2a7603d69b4..7238e24bf29 
100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-r-m-sprop.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-r-m-sprop.pbtxt @@ -26,7 +26,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt index c85e88ab649..e4bbdc3ec55 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt @@ -26,7 +26,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt index 84718036246..aaf0e8cc131 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt @@ -26,7 +26,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt index 0466ea65fa3..2abbf63ada3 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt @@ -26,7 +26,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt index 9762fad5d0f..c7c04aa59cf 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt @@ -26,7 +26,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt 
b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt index f477a60d237..a507e04483b 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt @@ -26,7 +26,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-ftrl.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-ftrl.pbtxt index 9b736df5819..53b091a553b 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-ftrl.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-ftrl.pbtxt @@ -26,7 +26,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt index 3ffb4bb8b4d..80a8e3a90db 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt @@ -26,7 +26,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt index 9639c71ce41..e95145b1fc5 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt @@ -25,7 +25,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt index 2a7603d69b4..7238e24bf29 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt @@ -26,7 +26,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: 
"from_config" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt index c85e88ab649..e4bbdc3ec55 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt @@ -26,7 +26,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adadelta.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adadelta.pbtxt index 2b476fafa9a..8db3a63c868 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adadelta.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adadelta.pbtxt @@ -26,7 +26,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adagrad.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adagrad.pbtxt index be2fedfe81f..8505aa299e6 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adagrad.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adagrad.pbtxt @@ -26,7 +26,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adam.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adam.pbtxt index 919c433648f..2014e181484 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adam.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adam.pbtxt @@ -26,7 +26,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adamax.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adamax.pbtxt index 67fce4f5c63..a30f2a9afa4 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adamax.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adamax.pbtxt @@ -26,7 +26,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: 
"from_config" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-ftrl.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-ftrl.pbtxt index 43bf48ef5d4..f83fcd959de 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-ftrl.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-ftrl.pbtxt @@ -26,7 +26,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-nadam.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-nadam.pbtxt index 06363234ea6..a21c2d9790c 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-nadam.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-nadam.pbtxt @@ -26,7 +26,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-optimizer.pbtxt index 041922bdfd1..611044aa9c6 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-optimizer.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-optimizer.pbtxt @@ -25,7 +25,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-r-m-sprop.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-r-m-sprop.pbtxt index 5deef618248..a49290a1227 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-r-m-sprop.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-r-m-sprop.pbtxt @@ -26,7 +26,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-s-g-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-s-g-d.pbtxt index 8a24dcfd2d0..6ac6872477d 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-s-g-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-s-g-d.pbtxt @@ -26,7 +26,7 @@ tf_class { } member_method { name: "apply_gradients" - argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'self\', \'grads_and_vars\', \'name\', \'all_reduce_sum_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], " } member_method { name: "from_config" From 
97c7e733c562e74deb786ae964dcd88d4d93eb6d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 20:46:35 -0800 Subject: [PATCH 327/442] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 296121312 Change-Id: I8eefa17cfa266fb6d642381d63ecfc0c6ffa0ba0 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 449a95765a5..ecdce1e627b 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. 
-// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21345,7 +21345,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22053,7 +22053,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22249,7 +22249,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22318,7 +22318,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22433,7 +22433,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22492,7 +22492,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22666,7 +22666,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22857,7 +22857,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25297,7 +25297,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25629,7 +25629,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25679,7 +25679,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25929,7 +25929,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26559,7 +26559,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27624,7 +27624,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45536,7 +45536,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From db85f4c207145961c6c671745e410ed55f57616e Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Wed, 19 Feb 2020 22:04:16 -0800 Subject: [PATCH 328/442] [XLA:GPU] Add an AllReduceCombiner pass, that merges AllReduce operations. On GPU, implement combined allreduces using NCCL groups. 
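Before reading the C++ below, it may help to see the pass's core packing heuristic in isolation. The following is a plain-Python illustration written for this note (not part of the patch): it only shows how all-reduce operands, visited in order, are greedily packed under a byte threshold and an operand-count threshold, and it omits the reduction-kind, replica-group, domain and data-dependency checks that the real pass performs.

```python
def plan_combine_sets(sizes_in_bytes, threshold_bytes, threshold_count):
  """Greedily pack all-reduce operand sizes into combinable sets."""
  sets = [[]]
  current_bytes = 0
  for size in sizes_in_bytes:
    if size > threshold_bytes:
      # An all-reduce larger than the threshold can never be combined.
      continue
    if (current_bytes + size > threshold_bytes or
        len(sets[-1]) + 1 > threshold_count):
      sets.append([])  # start a new combine set
      current_bytes = 0
    sets[-1].append(size)
    current_bytes += size
  # Only sets with at least two members are rewritten into one combined op.
  return [s for s in sets if len(s) >= 2]

print(plan_combine_sets([4, 8, 4, 1024, 16, 16],
                        threshold_bytes=64, threshold_count=4))
# -> [[4, 8, 4, 16]]
```

A single combined op amortizes the fixed latency of each small all-reduce, which is the rationale spelled out in the pass's header comment further below.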
PiperOrigin-RevId: 296130269 Change-Id: I763f0139c8ed9a59d7d691e3252e6b46244fefd6 --- tensorflow/compiler/xla/service/BUILD | 45 ++ .../xla/service/all_reduce_combiner.cc | 452 +++++++++++++++++ .../xla/service/all_reduce_combiner.h | 51 ++ .../xla/service/all_reduce_combiner_test.cc | 477 ++++++++++++++++++ .../xla/service/collective_ops_utils.h | 23 +- .../compiler/xla/service/cpu/cpu_runtime.cc | 32 +- tensorflow/compiler/xla/service/gpu/BUILD | 1 + .../xla/service/gpu/dummy_all_reduce_thunk.cc | 8 +- .../compiler/xla/service/gpu/gpu_compiler.cc | 9 +- .../xla/service/gpu/ir_emitter_unnested.cc | 93 ++-- .../xla/service/gpu/nccl_all_reduce_thunk.cc | 81 +-- .../xla/service/gpu/nccl_all_reduce_thunk.h | 13 +- .../compiler/xla/tests/collective_ops_test.cc | 49 ++ 13 files changed, 1229 insertions(+), 105 deletions(-) create mode 100644 tensorflow/compiler/xla/service/all_reduce_combiner.cc create mode 100644 tensorflow/compiler/xla/service/all_reduce_combiner.h create mode 100644 tensorflow/compiler/xla/service/all_reduce_combiner_test.cc diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 7dc03511f30..34fd40f11d8 100755 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -1947,6 +1947,51 @@ tf_cc_test( ], ) +cc_library( + name = "all_reduce_combiner", + srcs = ["all_reduce_combiner.cc"], + hdrs = ["all_reduce_combiner.h"], + deps = [ + ":hlo", + ":hlo_domain_map", + ":hlo_pass", + ":hlo_query", + ":hlo_reachability", + ":shape_inference", + "//tensorflow/compiler/xla:array2d", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/core:lib", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", + ], +) + +tf_cc_test( + name = "all_reduce_combiner_test", + srcs = ["all_reduce_combiner_test.cc"], + deps = [ + ":all_reduce_combiner", + ":hlo", + ":hlo_matchers", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:test_utils", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "@com_google_absl//absl/memory", + ], +) + cc_library( name = "all_reduce_simplifier", srcs = ["all_reduce_simplifier.cc"], diff --git a/tensorflow/compiler/xla/service/all_reduce_combiner.cc b/tensorflow/compiler/xla/service/all_reduce_combiner.cc new file mode 100644 index 00000000000..2b41f19f288 --- /dev/null +++ b/tensorflow/compiler/xla/service/all_reduce_combiner.cc @@ -0,0 +1,452 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/all_reduce_combiner.h" + +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/strings/str_join.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/service/hlo_domain_map.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_query.h" +#include "tensorflow/compiler/xla/service/hlo_reachability.h" +#include "tensorflow/compiler/xla/service/shape_inference.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace { + +// Combines the elements of to_combine into a single AllReduce op. All +// entries in to_combine must be AllReduce ops with exactly one operand +// and the same reduction operation. +Status CombineAllReduces(absl::Span to_combine) { + if (to_combine.size() < 2) { + return Status::OK(); + } + VLOG(1) << "Combined " << to_combine.size() << " CRS ops"; + + HloComputation& computation = *to_combine.back()->parent(); + HloComputation* reduction = to_combine[0]->to_apply(); + const HloOpcode type = reduction->root_instruction()->opcode(); + + // Create a single bigger AllReduce of the operands of the smaller + // AllReduces. + std::vector operands; + std::vector operand_shapes; + VLOG(1) << "Combining set"; + for (HloInstruction* hlo : to_combine) { + VLOG(1) << "Set element: " << hlo->ToString(); + TF_RET_CHECK(hlo->opcode() == HloOpcode::kAllReduce); + TF_RET_CHECK(hlo->operands().size() == 1); + TF_RET_CHECK(hlo->to_apply() == reduction || + (hlo->to_apply()->instruction_count() == 3 && + hlo->to_apply()->num_parameters() == 2 && + hlo->to_apply()->root_instruction()->opcode() == type)); + TF_RET_CHECK(hlo->shape().IsArray()); + for (HloInstruction* operand : hlo->operands()) { + operands.push_back(operand); + operand_shapes.push_back(operand->shape()); + } + } + + HloInstruction* combined; + // AllReduce ops with more than one operand produce a tuple. + TF_RET_CHECK(operands.size() >= 2); + combined = computation.AddInstruction(HloInstruction::CreateAllReduce( + ShapeUtil::MakeTupleShape(operand_shapes), operands, reduction, + to_combine.front()->replica_groups(), + /*constrain_layout=*/false, to_combine.front()->channel_id())); + + // We have to propagate the sharding manually because Domain instructions are + // not guaranteed to preserve it for side effecting instructions. + if (to_combine.front()->has_sharding()) { + combined->set_sharding(to_combine.front()->sharding()); + } + VLOG(1) << "Replacing with : " << combined->ToString(); + + // Replace all the smaller AllReduces with elements of the tuple output + // of the single bigger AllReduce. 
+ for (int64 i = 0; i < to_combine.size(); ++i) { + auto replace_with = HloInstruction::CreateGetTupleElement( + to_combine[i]->shape(), combined, i); + TF_RETURN_IF_ERROR(computation.ReplaceWithNewInstruction( + to_combine[i], std::move(replace_with))); + } + return Status::OK(); +} + +struct GroupKey { + GroupKey(const HloInstruction* hlo, const HloDomainMap& domain_map) + : opcode(hlo->to_apply()->root_instruction()->opcode()), + accum_type(hlo->to_apply()->root_instruction()->shape().element_type()), + domain_id(domain_map.GetDomainMetadataId(hlo)), + is_cross_shard(hlo->channel_id().has_value()), + replica_groups(hlo->replica_groups()) {} + + bool operator<(const GroupKey& other) const { + if (opcode != other.opcode) { + return opcode < other.opcode; + } + if (accum_type != other.accum_type) { + return accum_type < other.accum_type; + } + if (domain_id != other.domain_id) { + return domain_id < other.domain_id; + } + if (is_cross_shard != other.is_cross_shard) { + return is_cross_shard < other.is_cross_shard; + } + if (replica_groups.size() != other.replica_groups.size()) { + return replica_groups.size() < other.replica_groups.size(); + } + for (int64 i = 0; i < replica_groups.size(); ++i) { + const auto& rg = replica_groups[i]; + const auto& org = other.replica_groups[i]; + if (rg.replica_ids_size() != org.replica_ids_size()) { + return rg.replica_ids_size() < org.replica_ids_size(); + } + for (int64 j = 0; j < rg.replica_ids_size(); ++j) { + if (rg.replica_ids(j) != org.replica_ids(j)) { + return rg.replica_ids(j) < org.replica_ids(j); + } + } + } + return false; + } + + HloOpcode opcode; + PrimitiveType accum_type; + int64 domain_id; + bool is_cross_shard; + std::vector replica_groups; +}; + +// Group AllReduce instructions by the reduction types, e.g., add, min, +// max, replica groups and domain. For cross-module all reduce instructions +// we group them by the set of domains they are reducing across. +// +// Note that the shape of the reduction computation is not included in the +// reduction types, e.g.: "f32[] add" and "bf16[] add" will be the same type. We +// need to disallow combining CRS instructions with different domain metadata as +// well as that could end up short-cutting two or more different domains. +// +// In each group, the instructions should be in post order. We will then iterate +// each group and try to combine them, so to prevent non-determinism, we use +// std::map here. +// +// The return value is a list of groups where every group contains a list of +// all-reduce instruction sets in topological order and with a deterministic +// order within the set. Additionally due to the above constraints every all +// reduce set within a group will contain the same number of elements +// and every instruction within an all reduce set will have the same +// all-reduce-id (if specified) and thus shape (all reduce sets without an +// all-reduce-id will have a single instruction). +using InstructionGroups = + std::vector>>; +StatusOr CreateComputationGroups( + HloComputation* computation) { + TF_ASSIGN_OR_RETURN(auto domain_map, HloDomainMap::Create(computation, "")); + + // Group instructions by opcode, domain id and replica group. 
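The grouping criteria described in the comments above can be pictured with a short, self-contained Python analogue (an editorial illustration only; the dictionaries stand in for HLO instructions, and the real `GroupKey` additionally records the domain id):

```python
from collections import defaultdict

def group_compatible(all_reduces):
  """Bucket all-reduce ops by a compatibility key, mirroring GroupKey."""
  groups = defaultdict(list)
  for op in all_reduces:
    key = (op["reduction"],                           # e.g. "add" vs "min"
           op["accum_type"],                          # e.g. "f32" vs "bf16"
           op["is_cross_shard"],
           tuple(tuple(g) for g in op["replica_groups"]))
    groups[key].append(op["name"])
  return dict(groups)

ops = [
    {"name": "ar0", "reduction": "add", "accum_type": "f32",
     "is_cross_shard": False, "replica_groups": [[0, 1]]},
    {"name": "ar1", "reduction": "add", "accum_type": "f32",
     "is_cross_shard": False, "replica_groups": [[0, 1]]},
    {"name": "ar2", "reduction": "min", "accum_type": "f32",
     "is_cross_shard": False, "replica_groups": [[0, 1]]},
]
print(group_compatible(ops))  # ar0 and ar1 share a bucket; ar2 is alone
```

Only ops that land in the same bucket are candidates for the combining step sketched earlier.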
+ std::map> opcode_groups; + for (HloInstruction* instruction : computation->MakeInstructionPostOrder()) { + if (instruction->opcode() != HloOpcode::kAllReduce) { + continue; + } + if (instruction->to_apply()->instruction_count() != 3 || + instruction->to_apply()->num_parameters() != 2) { + VLOG(1) << "Skipping due to non-trivial reduction function."; + continue; + } + opcode_groups[GroupKey(instruction, *domain_map)].push_back(instruction); + } + + // Generate a unique all-reduce-id for instructions without one by negating + // the unique id of the hlo. This way we can treat cross module and normal CRS + // instructions uniformly. + auto channel_id = [](const HloInstruction* all_reduce) { + return all_reduce->IsCrossModuleAllReduce() + ? all_reduce->channel_id().value() + : -1 * all_reduce->unique_id(); + }; + + // Group instructions by all-reduce id with instructions for an all-reduce id + // is listed along their group id and the (group id, instruction) pairs are + // sorted by group id in the vector. + std::map>> + all_reduce_sets; + int64 group_id = 0; + for (auto& domain_groups : opcode_groups) { + for (HloInstruction* hlo : domain_groups.second) { + all_reduce_sets[channel_id(hlo)].emplace_back(group_id, hlo); + } + ++group_id; + } + + // Group instructions by participating group ids. Instructions within a group + // are sorted by topological order and instructions within an all reduce group + // is still sorted by group id. + std::map, std::vector>> + all_reduce_group_map; + for (HloInstruction* instruction : computation->MakeInstructionPostOrder()) { + if (instruction->opcode() != HloOpcode::kAllReduce) { + continue; + } + if (instruction->to_apply()->instruction_count() != 3 || + instruction->to_apply()->num_parameters() != 2) { + VLOG(1) << "Skipping due to non-trivial reduction function."; + continue; + } + + int64 arid = channel_id(instruction); + if (all_reduce_sets.count(arid) == 0) { + // Already processed. + continue; + } + + std::vector group_ids; + std::vector instructions; + for (const auto& hlo : all_reduce_sets[arid]) { + group_ids.push_back(hlo.first); + instructions.push_back(hlo.second); + } + all_reduce_group_map[group_ids].push_back(std::move(instructions)); + all_reduce_sets.erase(arid); + } + CHECK(all_reduce_sets.empty()); + + InstructionGroups groups; + for (const auto& all_reduce_group : all_reduce_group_map) { + groups.push_back(all_reduce_group.second); + } + return std::move(groups); +} + +} // namespace + +AllReduceCombiner::AllReduceCombiner(int64 combine_threshold_in_bytes, + int64 combine_threshold_count) + : combine_threshold_in_bytes_(combine_threshold_in_bytes), + combine_threshold_count_(combine_threshold_count) {} + +StatusOr AllReduceCombiner::Run(HloModule* module) { + VLOG(1) << "Running AllReduceCombiner with threshold of " + << combine_threshold_in_bytes_ << " bytes"; + + if (hlo_query::ContainsLayoutConstrainedAllReduce(*module)) { + VLOG(1) << "Skip AllReduceCombiner because the module contains all-reduce " + "with constrained layouts"; + return false; + } + + bool changed = false; + for (HloComputation* computation : module->MakeNonfusionComputations()) { + TF_ASSIGN_OR_RETURN(auto groups, CreateComputationGroups(computation)); + for (auto group : groups) { + // Recompute reachability after every combine group because we can't + // maintain a cross group topolgical order to be able to rely on the + // transitive dependencies to detect cycles. 
+ auto reachability = HloReachabilityMap::Build(computation); + + // Create a map to be able to find an instruction group based on the first + // instruction in the group. It will be used during the post order + // iteration to be able to process full groups at a time. Doing it only + // for one instruction in every group will be sufficient because all + // instruction have to schedule at the same time due to cross core + // dependencies. + absl::flat_hash_map*> + group_map; + for (auto& instruction : group) { + group_map[instruction.front()] = &instruction; + } + + // Collect sets of AllReduce instructions to combine. + std::vector>> combine_sets(1); + int64 current_size_in_bytes = 0; + int64 current_operand_count = 0; + + // Iterate all instructions in post order and skip the ones not in the + // current group. We have to create a new post order iteration for every + // group because merging instructions in the previous group can made the + // original post order no longer hold. + // This will make it likely that we won't increase memory pressure much + // above combine_threshold_in_bytes, since two AllReduces that are + // near in post order are most likely, but not for sure, also near in + // scheduled order. + // + // TODO(b/70235266): This should usually be fine, but it's probably + // possible to construct some case where the memory usage increases beyond + // the threshold due to reordering of the instructions in scheduling. If + // this ever comes up as a real problem, it would be nice to implement + // safeguards so that that cannot possibly happen. + for (const HloInstruction* inst : + computation->MakeInstructionPostOrder()) { + auto it = group_map.find(inst); + if (it == group_map.end()) { + // Instruction belongs to a different group. + continue; + } + const auto& instructions = *it->second; + + VLOG(1) << "Considering HLO " << instructions.front()->ToString() + << " with current set size of " << current_size_in_bytes + << " and current operand count of " << current_operand_count; + + // We do not handle AllReduce ops that do not have exactly 1 + // operand since that is simpler and this pass is the only way to + // generate such ops and it should rarely be important to consider the + // same ops again. + if (instructions.front()->operands().size() != 1) { + VLOG(1) << "Skipping due to " + << instructions.front()->operands().size() << " operands"; + continue; + } + + int64 size_in_bytes; + TF_RET_CHECK(instructions.front()->shape().IsArray()); + size_in_bytes = ShapeUtil::ByteSizeOf(instructions.front()->shape()); + + if (size_in_bytes > combine_threshold_in_bytes_) { + VLOG(1) << "Skipping due to size " << size_in_bytes + << " above threshold"; + // If the instruction is greather than the threshold, then we can + // never combine it with anything. + continue; + } + + // If the current set is dependent on the instruction, then create a new + // one to avoid the dependency. We move on from the current set instead + // of ignoring the instruction since otherwise a single AllReduce + // instruction that all the other ones depend on (such as one on the + // forward pass of a model) could disable this optimization entirely. + TF_RET_CHECK(!combine_sets.empty()); + for (const auto& previous : combine_sets.back()) { + // The reachability information does not reflect the planned + // combination from combine_sets. We cannot just bring it up to date + // cheaply since HloReachabilityMap does not track reachability + // updates transitively and doing it directly is expensive. 
However, + // leaving it stale has no effect on the reachability queries that we + // are doing here because we are considering the ops in a topological + // order, so we can just leave it stale. + // + // Proof: Suppose A is the instruction we are looking to combine and B + // is an element of the current combine set that we are looking to + // combine A into. + // + // First of all, we check that all elements in each set do not depend + // on each other, so combining the *current* combine set cannot create + // new dependencies between A and B. It remains to prove that + // combining the prior combine sets also cannot create a dependency + // between A and B. + // + // Assume to get a contradiction that there are two AllReduce + // ops C and D in combine_sets that will be combined and that A and B + // are not connected now but that they will be after combining C and + // D. Then there exist paths in the dependency graph such that one of + // these cases is true: + // + // A -> ... -> C and D -> ... -> B + // A -> ... -> D and C -> ... -> B + // B -> ... -> C and D -> ... -> A + // B -> ... -> D and C -> ... -> A + // + // None of these cases are possible because we are visiting the nodes + // in a topological order, so C and D cannot be in-between A and B. + // That is a contradiction, so combining the prior combine sets also + // cannot create a dependency between A and B. + bool new_set = false; + for (int64 i = 0; i < instructions.size(); ++i) { + if (reachability->IsReachable(previous[i], instructions[i])) { + VLOG(1) << "Starting new set due to dependency between " + << previous[i]->ToString() << " AND " + << instructions[i]->ToString(); + new_set = true; + break; + } + } + if (new_set) { + combine_sets.emplace_back(); + current_size_in_bytes = 0; + current_operand_count = 0; + break; + } + } + + if (current_size_in_bytes + size_in_bytes > + combine_threshold_in_bytes_ || + current_operand_count + 1 > combine_threshold_count_) { + VLOG(1) << "The instruction cannot be entered into the set due " + "to the combined size being too large."; + // In this case we cannot include the instruction into the current set + // since then it would grow beyond the threshold. The set of + // instructions to carry forward will either be the current set or the + // instruction by itself, whichever is smaller, since that maximizes + // the chance of being able to combine with the next instruction. + if (size_in_bytes > current_size_in_bytes) { + VLOG(1) << "Skipping as the instruction is larger than the set."; + continue; // keep the current set + } + VLOG(1) + << "Resetting the set as the set is larger than the instruction."; + combine_sets.emplace_back(); + current_size_in_bytes = 0; + current_operand_count = 0; + } + + VLOG(1) << "Adding instruction to set."; + combine_sets.back().push_back(instructions); + current_size_in_bytes += size_in_bytes; + current_operand_count += 1; + TF_RET_CHECK(current_size_in_bytes <= combine_threshold_in_bytes_); + TF_RET_CHECK(current_operand_count <= combine_threshold_count_); + } + VLOG(1) << "Done constructing sets. Final set size is " + << current_size_in_bytes << " bytes and " << current_operand_count + << " operands"; + + // Combine the collected sets of AllReduce instructions. 
+ for (const auto& combine_set : combine_sets) { + if (combine_set.size() >= 2) { + changed = true; + for (int64 i = 0; i < combine_set.front().size(); ++i) { + std::vector to_combine; + to_combine.reserve(combine_set.size()); + for (const auto& c : combine_set) { + to_combine.push_back(c[i]); + } + TF_RETURN_IF_ERROR(CombineAllReduces(to_combine)); + } + } + } + } + } + + return changed; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/all_reduce_combiner.h b/tensorflow/compiler/xla/service/all_reduce_combiner.h new file mode 100644 index 00000000000..92f85058552 --- /dev/null +++ b/tensorflow/compiler/xla/service/all_reduce_combiner.h @@ -0,0 +1,51 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_ALL_REDUCE_COMBINER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_ALL_REDUCE_COMBINER_H_ + +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/xla/array2d.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace xla { + +// Combines small non-dependent AllReduce ops into larger combined +// AllReduce ops. A typical AllReduce implementation has a minimum +// latency-induced time for a AllReduce op so a single combined op can be +// more efficient than many small ones. +class AllReduceCombiner : public HloModulePass { + public: + AllReduceCombiner(int64 combine_threshold_in_bytes, + int64 combine_threshold_count); + + absl::string_view name() const override { return "all-reduce-combiner"; } + + StatusOr Run(HloModule* module) override; + + private: + // Combine all reduce ops up to this threshold. + int64 combine_threshold_in_bytes_; + + // Combine all reduce ops up to this threshold (number of operands). + int64 combine_threshold_count_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_ALL_REDUCE_COMBINER_H_ diff --git a/tensorflow/compiler/xla/service/all_reduce_combiner_test.cc b/tensorflow/compiler/xla/service/all_reduce_combiner_test.cc new file mode 100644 index 00000000000..0793ba2ba4b --- /dev/null +++ b/tensorflow/compiler/xla/service/all_reduce_combiner_test.cc @@ -0,0 +1,477 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/all_reduce_combiner.h" + +#include + +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/test_utils.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace { + +using absl::nullopt; +using ::testing::AllOf; +namespace op = xla::testing::opcode_matchers; +int64 kMaxCombineCount = 256; + +int64 AllReduceCount(const HloModule& module) { + int64 count = 0; + for (HloComputation* computation : module.computations()) { + if (computation->IsFusionComputation()) { + continue; + } + for (HloInstruction* hlo : computation->instructions()) { + if (hlo->opcode() == HloOpcode::kAllReduce) { + ++count; + } + } + } + return count; +} + +// inputs[i] will be some op producing a shape of size sizes_in_kib[i] which +// feeds into a a all reduce op in all_reduces[i]. Returns a tuple +// of the all_reduces. +HloInstruction* MakeCrossReplicaReductions( + std::vector sizes_in_kib, std::vector reductions, + std::vector* inputs, HloComputation::Builder* b) { + CHECK_EQ(reductions.size(), sizes_in_kib.size()); + std::vector all_reduces; + for (int i = 0; i < sizes_in_kib.size(); i++) { + int64 size_in_kib = sizes_in_kib[i]; + HloComputation* reduction = reductions[i]; + auto constant = b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.3))); + Shape shape = ShapeUtil::MakeShape( + F32, {static_cast(size_in_kib * 1024 / sizeof(float))}); + auto input = + b->AddInstruction(HloInstruction::CreateBroadcast(shape, constant, {})); + inputs->push_back(input); + all_reduces.push_back(b->AddInstruction(HloInstruction::CreateAllReduce( + shape, {input}, reduction, /*replica_groups=*/{}, + /*constrain_layout=*/false, /*channel_id=*/nullopt))); + } + return b->AddInstruction(HloInstruction::CreateTuple(all_reduces)); +} + +// Create and add a reduction computation in the given type to the module. +HloComputation* MakeReduction(const HloOpcode type, HloModule* module) { + HloComputation::Builder sum_builder(HloOpcodeString(type)); + auto x = sum_builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {}), "x")); + auto y = sum_builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/1, ShapeUtil::MakeShape(F32, {}), "y")); + sum_builder.AddInstruction( + HloInstruction::CreateBinary(ShapeUtil::MakeShape(F32, {}), type, x, y)); + HloComputation* reduction = + module->AddEmbeddedComputation(sum_builder.Build()); + return reduction; +} + +// Creates replica groups for AllReduce. groups[i] represents replica ids +// for group 'i'. 
+std::vector CreateReplicaGroups( + absl::Span> groups) { + std::vector replica_groups(groups.size()); + for (int64 i = 0; i < groups.size(); ++i) { + *replica_groups[i].mutable_replica_ids() = {groups[i].begin(), + groups[i].end()}; + } + return replica_groups; +} + +using AllReduceCombinerTest = HloTestBase; + +// Tests combination of several AllReduce instructions. +TEST_F(AllReduceCombinerTest, CombineAllReduces) { + auto module = CreateNewVerifiedModule(); + HloComputation* sum = MakeReduction(HloOpcode::kAdd, module.get()); + + HloComputation::Builder b(TestName()); + std::vector inputs; + auto root = MakeCrossReplicaReductions( + {1, 2, 10, 7, 6}, {sum, sum, sum, sum, sum}, &inputs, &b); + auto computation = module->AddEntryComputation(b.Build()); + + // Run the AllReduce combiner optimization pass. + AllReduceCombiner combine(10 * 1024 * 1024, kMaxCombineCount); + ASSERT_EQ(AllReduceCount(*module), inputs.size()); + TF_ASSERT_OK_AND_ASSIGN(bool changed, combine.Run(module.get())); + ASSERT_EQ(AllReduceCount(*module), 1); + EXPECT_TRUE(changed); + + ASSERT_EQ(root, computation->root_instruction()); + ASSERT_EQ(inputs.size(), root->operands().size()); + + HloInstruction* combined = nullptr; + for (int64 i = 0; i < root->operands().size(); ++i) { + HloInstruction* hlo = root->mutable_operand(i); + ASSERT_TRUE(hlo->opcode() == HloOpcode::kGetTupleElement); + EXPECT_EQ(hlo->tuple_index(), i); + EXPECT_TRUE(ShapeUtil::Equal(inputs[i]->shape(), hlo->shape())); + + if (combined == nullptr) { + // Verify the combined all reduce instruction. + combined = hlo->mutable_operand(0); + ASSERT_TRUE(combined->opcode() == HloOpcode::kAllReduce); + EXPECT_TRUE(ShapeUtil::Equal(root->shape(), combined->shape())); + ASSERT_EQ(combined->operands().size(), inputs.size()); + } + EXPECT_EQ(combined, hlo->operand(0)); + EXPECT_TRUE(ShapeUtil::Equal(inputs[i]->shape(), hlo->shape())); + EXPECT_EQ(combined->operand(i), inputs[i]); + EXPECT_EQ(1, inputs[i]->users().size()); + } + ASSERT_NE(combined, nullptr); +} + +// Tests combination of several cross replica reduction instructions in +// different types.k +TEST_F(AllReduceCombinerTest, CombineCrossReplicaReductionsInGroups) { + auto module = CreateNewVerifiedModule(); + HloComputation* sum = MakeReduction(HloOpcode::kAdd, module.get()); + HloComputation* min = MakeReduction(HloOpcode::kMinimum, module.get()); + HloComputation* max = MakeReduction(HloOpcode::kMaximum, module.get()); + HloComputation* sum_2 = MakeReduction(HloOpcode::kAdd, module.get()); + + HloComputation::Builder b(TestName()); + std::vector inputs; + MakeCrossReplicaReductions( + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, + {sum, sum_2, min, min, min, max, max, max, sum, sum_2}, &inputs, &b); + module->AddEntryComputation(b.Build()); + + // Run the AllReduce combiner optimization pass. + AllReduceCombiner combine(10 * 1024 * 1024, kMaxCombineCount); + ASSERT_EQ(AllReduceCount(*module), inputs.size()); + TF_ASSERT_OK_AND_ASSIGN(bool changed, combine.Run(module.get())); + ASSERT_EQ(AllReduceCount(*module), 3) + << "expects 3 groups for 3 reduction types."; + EXPECT_TRUE(changed); +} + +// Tests that the combination threshold is respected. 
+TEST_F(AllReduceCombinerTest, RespectThreshold) { + auto module = CreateNewVerifiedModule(); + HloComputation* sum = MakeReduction(HloOpcode::kAdd, module.get()); + + HloComputation::Builder b(TestName()); + std::vector inputs; + MakeCrossReplicaReductions({8, 4}, {sum, sum}, &inputs, &b); + module->AddEntryComputation(b.Build()); + + // Run the AllReduce combiner optimization pass with threshold less than + // the combined size of the all reduce ops so that the combination + // cannot occur. + { + AllReduceCombiner combine((8 + 4) * 1024 - 1, kMaxCombineCount); + ASSERT_EQ(AllReduceCount(*module), inputs.size()); + TF_ASSERT_OK_AND_ASSIGN(bool changed, combine.Run(module.get())); + EXPECT_EQ(AllReduceCount(*module), inputs.size()); + EXPECT_FALSE(changed); + } + + // Run the AllReduce combiner optimization pass again with a slightly + // higher threshold so that the combination can occur. + { + AllReduceCombiner combine((8 + 4) * 1024, kMaxCombineCount); + ASSERT_EQ(AllReduceCount(*module), inputs.size()); + TF_ASSERT_OK_AND_ASSIGN(bool changed, combine.Run(module.get())); + EXPECT_EQ(AllReduceCount(*module), 1); + EXPECT_TRUE(changed); + } +} + +// Tests that dependent all reduces are not combined. +TEST_F(AllReduceCombinerTest, NoDependentCombination) { + auto module = CreateNewVerifiedModule(); + HloComputation* reduction = MakeReduction(HloOpcode::kAdd, module.get()); + + HloComputation::Builder b(TestName()); + auto constant = b.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.3))); + auto all_reduce = b.AddInstruction(HloInstruction::CreateAllReduce( + constant->shape(), {constant}, reduction, /*replica_groups=*/{}, + /*constrain_layout=*/false, /*channel_id=*/nullopt)); + b.AddInstruction(HloInstruction::CreateAllReduce( + constant->shape(), {all_reduce}, reduction, + /*replica_groups=*/{}, /*constrain_layout=*/false, + /*channel_id=*/nullopt)); + + module->AddEntryComputation(b.Build()); + + AllReduceCombiner combine(1024 * 1024, kMaxCombineCount); + ASSERT_EQ(AllReduceCount(*module), 2); + TF_ASSERT_OK_AND_ASSIGN(bool changed, combine.Run(module.get())); + EXPECT_EQ(AllReduceCount(*module), 2); + EXPECT_FALSE(changed); +} + +// Tests that AllReduce ops with different groups are not combined. 
+TEST_F(AllReduceCombinerTest, GroupAllReduce) { + auto module = CreateNewVerifiedModule(); + HloComputation::Builder b(TestName()); + HloComputation* reduction = MakeReduction(HloOpcode::kAdd, module.get()); + + auto constant = b.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.3))); + auto crs0 = b.AddInstruction( + HloInstruction::CreateAllReduce(constant->shape(), {constant}, reduction, + CreateReplicaGroups({{0, 1}, {2, 3}}), + /*constrain_layout=*/false, + /*channel_id=*/nullopt)); + auto crs1 = b.AddInstruction( + HloInstruction::CreateAllReduce(constant->shape(), {constant}, reduction, + CreateReplicaGroups({{0, 2}, {1, 3}}), + /*constrain_layout=*/false, + /*channel_id=*/nullopt)); + b.AddInstruction(HloInstruction::CreateTuple({crs0, crs1})); + + module->AddEntryComputation(b.Build()); + + AllReduceCombiner combine(1024 * 1024, kMaxCombineCount); + ASSERT_EQ(AllReduceCount(*module), 2); + TF_ASSERT_OK_AND_ASSIGN(bool changed, combine.Run(module.get())); + EXPECT_EQ(AllReduceCount(*module), 2); + EXPECT_FALSE(changed); +} + +TEST_F(AllReduceCombinerTest, DomainPreventsCombining) { + const char* const hlo_string = R"( +HloModule Module + +summit { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(lhs, rhs) +} + +ENTRY entry { + param0 = f32[128] parameter(0), sharding={maximal device=0} + param1 = f32[128] parameter(1), sharding={maximal device=1} + crs0 = f32[128] all-reduce(param0), + replica_groups={}, to_apply=summit, sharding={maximal device=0} + crs1 = f32[128] all-reduce(param1), + replica_groups={}, to_apply=summit, sharding={maximal device=1} + domain0 = f32[128] domain(crs0), + domain={kind="sharding", entry={{maximal device=0}, {maximal device=1}}, exit={maximal device=0}} + domain1 = f32[128] domain(crs1), + domain={kind="sharding", entry={{maximal device=0}, {maximal device=1}}, exit={maximal device=1}} + ROOT tuple = (f32[128], f32[128]) tuple(domain0, domain1), + sharding={{maximal device=0}, {maximal device=1}} +} +)"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + LOG(INFO) << "Original module:\n" << module->ToString(); + + AllReduceCombiner combine(1024 * 1024, kMaxCombineCount); + ASSERT_EQ(AllReduceCount(*module), 2); + TF_ASSERT_OK_AND_ASSIGN(bool changed, combine.Run(module.get())); + EXPECT_EQ(AllReduceCount(*module), 2); + EXPECT_FALSE(changed); +} + +// This test checks that two CRS instructions that are in separate domains +// but with the same domain metadata can be combined. 
+TEST_F(AllReduceCombinerTest, CombineFromTwoDomainsWithSameMetadata) { + const char* const hlo_string = R"( +HloModule Module + +summit { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(lhs, rhs) +} + +ENTRY entry { + param0 = f32[128] parameter(0), sharding={maximal device=0} + param1 = f32[128] parameter(1), sharding={maximal device=1} + param2 = f32[128] parameter(2), sharding={maximal device=1} + crs0 = f32[128] all-reduce(param0), + replica_groups={}, to_apply=summit, sharding={maximal device=0} + crs1 = f32[128] all-reduce(param1), + replica_groups={}, to_apply=summit, sharding={maximal device=1} + crs2 = f32[128] all-reduce(param2), + replica_groups={}, to_apply=summit, sharding={maximal device=0} + domain0 = f32[128] domain(crs0), + domain={kind="sharding", entry={{maximal device=0}, {maximal device=1}, + {maximal device=0}}, exit={maximal device=0}} + domain1 = f32[128] domain(crs1), + domain={kind="sharding", entry={{maximal device=0}, {maximal device=1}, + {maximal device=0}}, exit={maximal device=1}} + domain2 = f32[128] domain(crs2), + domain={kind="sharding", entry={{maximal device=0}, {maximal device=1}, + {maximal device=0}}, exit={maximal device=0}} + ROOT tuple = (f32[128], f32[128], f32[128]) tuple(domain0, domain1, domain2), + sharding={{maximal device=0}, {maximal device=1}, {maximal device=0}} +} +)"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + AllReduceCombiner combine(1024 * 1024, kMaxCombineCount); + ASSERT_EQ(AllReduceCount(*module), 3); + TF_ASSERT_OK_AND_ASSIGN(bool changed, combine.Run(module.get())); + EXPECT_EQ(AllReduceCount(*module), 2); + EXPECT_TRUE(changed); +} + +TEST_F(AllReduceCombinerTest, DoNotCombineCrossShardAndCrosReplicaInSPMD) { + const char* const hlo_string = R"( +HloModule Module + +summit { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(lhs, rhs) +} + +ENTRY entry { + param0 = f32[128] parameter(0), sharding={maximal device=0} + param1 = f32[128] parameter(1), sharding={maximal device=1} + cross_shard_ar = f32[128] all-reduce(param0), + replica_groups={{0}}, to_apply=summit, channel_id=1 + cross_replica_ar = f32[128] all-reduce(param1), + replica_groups={{0}}, to_apply=summit, sharding={maximal device=1} + ROOT tuple = (f32[128], f32[128]) tuple(cross_shard_ar, cross_replica_ar) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + AllReduceCombiner combine(1024 * 1024, kMaxCombineCount); + ASSERT_EQ(AllReduceCount(*module), 2); + TF_ASSERT_OK_AND_ASSIGN(bool changed, combine.Run(module.get())); + EXPECT_EQ(AllReduceCount(*module), 2); + EXPECT_FALSE(changed); +} + +TEST_F(AllReduceCombinerTest, CrossCoreAllReduce) { + const char* const hlo_string = R"( +HloModule Module + +summit { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(lhs, rhs) +} + +ENTRY entry { + param0 = f32[128] parameter(0), sharding={maximal device=0} + param1 = f32[128] parameter(1), sharding={maximal device=1} + crs00 = f32[128] all-reduce(param0), + replica_groups={{0}}, channel_id=1, to_apply=summit, + sharding={maximal device=0} + crs01 = f32[128] all-reduce(param1), + replica_groups={{0}}, channel_id=1, to_apply=summit, + sharding={maximal device=1} + crs10 = f32[128] all-reduce(param0), + replica_groups={{0}}, channel_id=2, to_apply=summit, + sharding={maximal device=0} + crs11 = f32[128] all-reduce(param1), + replica_groups={{0}}, channel_id=2, to_apply=summit, + sharding={maximal device=1} + 
domain0 = f32[128] domain(crs00), + domain={kind="sharding", entry={maximal device=0}, exit={maximal device=1}} + ROOT add = f32[128] add(domain0, crs11), + sharding={maximal device=1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + AllReduceCombiner combine(1024 * 1024, kMaxCombineCount); + ASSERT_EQ(AllReduceCount(*module), 4); + TF_ASSERT_OK_AND_ASSIGN(bool changed, combine.Run(module.get())); + EXPECT_EQ(AllReduceCount(*module), 2); + EXPECT_TRUE(changed); + + EXPECT_THAT( + module->entry_computation()->root_instruction(), + op::Add(op::Domain(op::GetTupleElement( + AllOf(op::AllReduce(op::Parameter(0), op::Parameter(0)), + op::Shape("(f32[128], f32[128])")), + 1)), + op::GetTupleElement( + AllOf(op::AllReduce(op::Parameter(1), op::Parameter(1)), + op::Shape("(f32[128], f32[128])")), + 0))); +} + +TEST_F(AllReduceCombinerTest, CrossCombineGroupCycle) { + const char* const hlo_string = R"( +HloModule module + +%add { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(lhs, rhs) +} + +%max { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] maximum(lhs, rhs) +} +ENTRY %comp { + p0 = f32[128] parameter(0) + p1 = f32[128] parameter(1) + + crs00 = f32[128] all-reduce(p0), to_apply=add + crs10 = f32[128] all-reduce(p1), to_apply=max + + crs01 = f32[128] all-reduce(crs00), to_apply=max + crs11 = f32[128] all-reduce(crs10), to_apply=add + add0 = f32[128] add(crs01, crs11) + + crs02 = f32[128] all-reduce(add0), to_apply=add + crs12 = f32[128] all-reduce(crs11), to_apply=add + ROOT tuple = (f32[128], f32[128]) tuple(crs02, crs12) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + AllReduceCombiner combine(1024 * 1024, kMaxCombineCount); + ASSERT_EQ(AllReduceCount(*module), 6); + TF_ASSERT_OK_AND_ASSIGN(bool changed, combine.Run(module.get())); + EXPECT_EQ(AllReduceCount(*module), 4); + EXPECT_TRUE(changed); + + auto crs0 = op::AllReduce(op::Parameter(0), op::AllReduce(op::Parameter(1))); + auto add = op::Add(op::AllReduce(op::GetTupleElement(crs0, 0)), + op::GetTupleElement(crs0, 1)); + auto crs1 = op::AllReduce(add, op::GetTupleElement(crs0)); + EXPECT_THAT( + module->entry_computation()->root_instruction(), + op::Tuple(op::GetTupleElement(crs1, 0), op::GetTupleElement(crs1, 1))); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/collective_ops_utils.h b/tensorflow/compiler/xla/service/collective_ops_utils.h index 8b3c60f76de..2524b4190e9 100644 --- a/tensorflow/compiler/xla/service/collective_ops_utils.h +++ b/tensorflow/compiler/xla/service/collective_ops_utils.h @@ -149,7 +149,6 @@ struct AllReduceParticipantData { explicit AllReduceParticipantData(RendezvousKey rendezvous_key) : rendezvous_key(rendezvous_key) {} - int64 element_count; int64 device_ordinal; RendezvousKey rendezvous_key; @@ -157,20 +156,30 @@ struct AllReduceParticipantData { // source_buffer == destination_buffer if that avoids a NCCL copy (will depend // on how well the NCCL in-place implementation performs vs the out-of-place // implementation). 
- se::DeviceMemoryBase source_data; - se::DeviceMemoryBase destination_data; + struct Buffer { + int64 element_count; + se::DeviceMemoryBase source_data; + se::DeviceMemoryBase destination_data; + PrimitiveType primitive_type; + }; + std::vector buffers; se::Stream* stream; ReductionKind reduction_kind; - PrimitiveType primitive_type; int num_participants() const { return rendezvous_key.num_participants(); } string ToString() const { + std::vector buffer_strs; + for (const Buffer& buffer : buffers) { + buffer_strs.push_back( + absl::StrFormat("{element_count=%d}", buffer.element_count)); + } return absl::StrFormat( - "AllReduceParticipantData{element_count=%d, rendezvous_key=%s, " + "AllReduceParticipantData{buffers=[%s], rendezvous_key=%s, " "device_ordinal=%d, stream=%p}", - element_count, rendezvous_key.ToString(), device_ordinal, stream); + absl::StrJoin(buffer_strs, ","), rendezvous_key.ToString(), + device_ordinal, stream); } }; @@ -245,7 +254,7 @@ class Rendezvous { // Spot check for consistent replica counts among submitting threads. if (!participants_.empty() && - (participants_.back().element_count != participant.element_count || + (participants_.back().buffers.size() != participant.buffers.size() || participants_.back().rendezvous_key != participant.rendezvous_key)) { return InvalidArgument( "Mismatch among all-reduce participants. Expected same " diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc index 56d663f7b24..98c23b679fa 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc @@ -262,7 +262,8 @@ class CpuAllReduceRendezvous : public xla::Rendezvous { protected: xla::StatusOr> SubmitParticipantImpl( xla::AllReduceParticipantData participant) override { - xla::PrimitiveType datatype = participant.primitive_type; + TF_RET_CHECK(participant.buffers.size() == 1); + xla::PrimitiveType datatype = participant.buffers.front().primitive_type; bool primary = [&] { tensorflow::mutex_lock lock(mu_); if (!initialized_) { @@ -316,10 +317,8 @@ class CpuAllReduceRendezvous : public xla::Rendezvous { using T = typename xla::primitive_util::PrimitiveTypeToNative::type; tensorflow::mutex_lock lock(mu_); CHECK(!participants_.empty()); - xla::int64 element_count = participant.element_count; xla::ReductionKind reduction_kind = participant.reduction_kind; for (const auto& p : participants_) { - CHECK_EQ(p.element_count, element_count); CHECK(p.reduction_kind == reduction_kind); } @@ -329,11 +328,19 @@ class CpuAllReduceRendezvous : public xla::Rendezvous { output_buffers.reserve(participants_.size()); for (auto& p : participants_) { - input_buffers.emplace_back(static_cast(p.source_data.opaque()), - element_count); - output_buffers.emplace_back(static_cast(p.destination_data.opaque()), - element_count); + CHECK_EQ(p.buffers.size(), 1); + CHECK_EQ(p.buffers.front().element_count, + participants_.front().buffers.front().element_count); + xla::int64 element_count = participant.buffers.front().element_count; + input_buffers.emplace_back( + static_cast(p.buffers.front().source_data.opaque()), + element_count); + output_buffers.emplace_back( + static_cast(p.buffers.front().destination_data.opaque()), + element_count); } + xla::int64 element_count = + participants_.front().buffers.front().element_count; auto compute = [reduction_kind](T a, T b) -> T { switch (reduction_kind) { @@ -416,7 +423,6 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_AllReduce( 
xla::RendezvousKey rendezvous_key(run_options->run_id(), participating_replicas_vec, op_kind, op_id); - auto shape_str = ShapeString(shape_ptr, shape_length); VLOG(2) << "All-reduce input/output shape : " << shape_str; @@ -426,14 +432,16 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_AllReduce( << "All-reduce on CPU is implemented only for dense arrays"; xla::AllReduceParticipantData participant(rendezvous_key); - participant.element_count = xla::ShapeUtil::ElementsIn(shape); participant.device_ordinal = device_ordinal; - participant.primitive_type = shape.element_type(); participant.stream = run_options->stream(); - participant.source_data = + xla::AllReduceParticipantData::Buffer buffer; + buffer.element_count = xla::ShapeUtil::ElementsIn(shape); + buffer.primitive_type = shape.element_type(); + buffer.source_data = se::DeviceMemoryBase(input_buffer, xla::ShapeUtil::ByteSizeOf(shape)); - participant.destination_data = + buffer.destination_data = se::DeviceMemoryBase(output_buffer, xla::ShapeUtil::ByteSizeOf(shape)); + participant.buffers = {buffer}; participant.reduction_kind = static_cast(reduction_kind); TF_CHECK_OK( diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 28e33b2a17e..d13eca30cdc 100755 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -1131,6 +1131,7 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:algebraic_simplifier", + "//tensorflow/compiler/xla/service:all_reduce_combiner", "//tensorflow/compiler/xla/service:batchnorm_expander", "//tensorflow/compiler/xla/service:buffer_assignment", "//tensorflow/compiler/xla/service:call_inliner", diff --git a/tensorflow/compiler/xla/service/gpu/dummy_all_reduce_thunk.cc b/tensorflow/compiler/xla/service/gpu/dummy_all_reduce_thunk.cc index 8e562387aac..7c3d76c1c92 100644 --- a/tensorflow/compiler/xla/service/gpu/dummy_all_reduce_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/dummy_all_reduce_thunk.cc @@ -42,15 +42,11 @@ NcclAllReduceThunk::DevicesWithOpenNcclChannels() { struct NcclAllReduceThunk::AuxData {}; NcclAllReduceThunk::NcclAllReduceThunk( - int64 replica_count, int64 element_count, - const BufferAllocation::Slice& source_buffer, - const BufferAllocation::Slice& destination_buffer, + int64 replica_count, std::vector buffers, const HloInstruction* all_reduce) : Thunk(Thunk::kNcclAllReduce, all_reduce), replica_count_(replica_count), - element_count_(element_count), - source_buffer_(source_buffer), - destination_buffer_(destination_buffer) {} + buffers_(std::move(buffers)) {} } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index bccf13b6104..e4c57203543 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -31,6 +31,7 @@ limitations under the License. 
#include "llvm/IR/Verifier.h" #include "tensorflow/compiler/xla/protobuf_util.h" #include "tensorflow/compiler/xla/service/algebraic_simplifier.h" +#include "tensorflow/compiler/xla/service/all_reduce_combiner.h" #include "tensorflow/compiler/xla/service/batchnorm_expander.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/call_inliner.h" @@ -291,7 +292,13 @@ Status GpuCompiler::OptimizeHloModule( horizontal_fusion.AddPass(); TF_RETURN_IF_ERROR(horizontal_fusion.Run(hlo_module).status()); } - + { + HloPassPipeline pipeline("all_reduce_combiner"); + pipeline.AddPass( + /*combine_threshold_in_bytes=*/30 * 1024 * 1024, + /*combine_threshold_count=*/256); + TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); + } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index c6b167f7402..8efcd2384a3 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1210,10 +1210,7 @@ Status IrEmitterUnnested::HandleCollectivePermute(HloInstruction* hlo) { return Status::OK(); } -namespace { - - -} // namespace +namespace {} // namespace Status IrEmitterUnnested::HandleAllReduce(HloInstruction* crs) { VLOG(2) << "AllReduce; replica count: " << hlo_module_config_.replica_count() @@ -1226,13 +1223,37 @@ Status IrEmitterUnnested::HandleAllReduce(HloInstruction* crs) { NcclAllReduceThunk::CanImplement(crs); if (should_use_nccl_thunk) { - CHECK(crs->operand(0)->shape().IsArray()) - << "Operands to all-reduce must be arrays: " << crs->ToString(); - AddThunkToThunkSequence(absl::make_unique( + std::vector buffers; + std::vector tuple_element_buffers; + buffers.resize(crs->operand_count()); + tuple_element_buffers.reserve(crs->operand_count()); + CHECK(crs->shape().IsArray() && crs->operand_count() == 1 || + crs->shape().IsTuple() && + crs->shape().tuple_shapes_size() == crs->operand_count()); + for (int i = 0; i < crs->operand_count(); ++i) { + CHECK(crs->operand(i)->shape().IsArray()) + << "Operands to all-reduce must be arrays: " << crs->ToString(); + buffers[i].element_count = + ShapeUtil::ElementsIn(crs->operand(i)->shape()); + buffers[i].source_buffer = GetAllocationSlice(*crs->operand(i)); + buffers[i].destination_buffer = GetAllocationSlice( + *crs, crs->shape().IsTuple() ? ShapeIndex({i}) : ShapeIndex({})); + tuple_element_buffers.push_back(buffers[i].destination_buffer); + } + auto all_reduce_thunk = absl::make_unique( /*replica_count=*/hlo_module_config_.replica_count(), - /*elements=*/ShapeUtil::ElementsIn(crs->operand(0)->shape()), - /*source_address=*/GetAllocationSlice(*crs->operand(0)), - /*destination_buffer=*/GetAllocationSlice(*crs), crs)); + /*buffers=*/std::move(buffers), crs); + if (crs->shape().IsTuple()) { + std::vector> thunks; + thunks.push_back(std::move(all_reduce_thunk)); + thunks.push_back(absl::make_unique( + tuple_element_buffers, GetAllocationSlice(*crs), nullptr)); + AddThunkToThunkSequence( + absl::make_unique(std::move(thunks), crs)); + } else { + AddThunkToThunkSequence(std::move(all_reduce_thunk)); + } + return Status::OK(); } @@ -1957,32 +1978,32 @@ void IrEmitterUnnested::EmitTile( // // TODO(cheshire): Once ptxas is fixed and TF switches to it, remove the // workaround. 
- ksl->For( - loop_name + "_y_in_tile", - /*start=*/constant(0), - /*end=*/ - ceil_of_ratio(b_.CreateSub(tile_height, thread_id_info.thread_id_y), - num_threads_y), - /*step=*/constant(1), [&](llvm::Value* y_indvar) { - llvm::Value* y_loc = b_.CreateAdd( - thread_id_info.thread_id_y, b_.CreateMul(y_indvar, num_threads_y)); - for (int64 j = 0; j < x_num_steps; j++) { - llvm::Value* x_loc = - b_.CreateAdd(constant(j * step_x), start_offset_x, "x_loc"); - IrArray::Index source_idx_x = - source_idx.AddOffsetToDim(y_loc, kDimY, &b_) - .AddOffsetToDim(constant(j * step_x), kDimX, &b_); - auto emit_element = [&] { - return emit_elem_function(source_idx_x, y_loc, x_loc, j); - }; - if (!x_tile_fits) { - ksl->If(loop_name + "_x_in_tile", - b_.CreateICmpULT(x_loc, tile_width), emit_element); - } else { - emit_element(); - } - } - }); + ksl->For(loop_name + "_y_in_tile", + /*start=*/constant(0), + /*end=*/ + ceil_of_ratio(b_.CreateSub(tile_height, thread_id_info.thread_id_y), + num_threads_y), + /*step=*/constant(1), [&](llvm::Value* y_indvar) { + llvm::Value* y_loc = + b_.CreateAdd(thread_id_info.thread_id_y, + b_.CreateMul(y_indvar, num_threads_y)); + for (int64 j = 0; j < x_num_steps; j++) { + llvm::Value* x_loc = + b_.CreateAdd(constant(j * step_x), start_offset_x, "x_loc"); + IrArray::Index source_idx_x = + source_idx.AddOffsetToDim(y_loc, kDimY, &b_) + .AddOffsetToDim(constant(j * step_x), kDimX, &b_); + auto emit_element = [&] { + return emit_elem_function(source_idx_x, y_loc, x_loc, j); + }; + if (!x_tile_fits) { + ksl->If(loop_name + "_x_in_tile", + b_.CreateICmpULT(x_loc, tile_width), emit_element); + } else { + emit_element(); + } + } + }); } // Emits code to process a tensor element in a tile for the given kCopy HLO that diff --git a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc index 9b2662a9a05..4498793113a 100644 --- a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc @@ -154,10 +154,6 @@ ncclRedOp_t ReductionKindToNccl(ReductionKind kind) { } } -PrimitiveType AllReducePrimitiveType(const HloInstruction* instr) { - return instr->operand(0)->shape().element_type(); -} - absl::optional DatatypeToNccl(PrimitiveType element_type) { switch (element_type) { case S8: @@ -402,9 +398,6 @@ RendezvousNcclAllReduce::SubmitParticipantImpl( VLOG(3) << "Performing all reduce from device ordinal: " << participant.device_ordinal; ncclRedOp_t computation = ReductionKindToNccl(participant.reduction_kind); - absl::optional allreduce_datatype = - DatatypeToNccl(participant.primitive_type); - CHECK(allreduce_datatype.has_value()); se::StreamExecutor* executor = participant.stream->parent(); se::cuda::ScopedActivateExecutorContext scoped_context(executor); @@ -412,19 +405,26 @@ RendezvousNcclAllReduce::SubmitParticipantImpl( participant.stream->implementation()->GpuStreamMemberHack()); VLOG(3) << "Using stream pointer: " << cu_stream << " on device: " << participant.device_ordinal; - void* send_buffer = participant.source_data.opaque(); - void* recv_buffer = participant.destination_data.opaque(); - VLOG(3) << absl::StreamFormat( - "Calling ncclAllReduce(send_buffer=%p, recv_buffer=%p, count=%d, " - "comm=%p, stream=%p)", - send_buffer, recv_buffer, participant.element_count, - static_cast(comm), cu_stream); - XLA_CUDA_RETURN_IF_ERROR(ncclAllReduce(send_buffer, recv_buffer, - /*count=*/participant.element_count, - /*datatype=*/*allreduce_datatype, - 
/*op=*/computation, - /*comm=*/comm, - /*stream=*/*cu_stream)); + XLA_CUDA_RETURN_IF_ERROR(ncclGroupStart()); + for (auto& buffer : participant.buffers) { + void* send_buffer = buffer.source_data.opaque(); + void* recv_buffer = buffer.destination_data.opaque(); + absl::optional allreduce_datatype = + DatatypeToNccl(buffer.primitive_type); + CHECK(allreduce_datatype.has_value()); + VLOG(3) << absl::StreamFormat( + "Calling ncclAllReduce(send_buffer=%p, recv_buffer=%p, count=%d, " + "comm=%p, stream=%p)", + send_buffer, recv_buffer, buffer.element_count, + static_cast(comm), cu_stream); + XLA_CUDA_RETURN_IF_ERROR(ncclAllReduce(send_buffer, recv_buffer, + /*count=*/buffer.element_count, + /*datatype=*/*allreduce_datatype, + /*op=*/computation, + /*comm=*/comm, + /*stream=*/*cu_stream)); + } + XLA_CUDA_RETURN_IF_ERROR(ncclGroupEnd()); VLOG(3) << "Done performing all reduce for ordinal: " << participant.device_ordinal; @@ -453,11 +453,14 @@ struct NcclAllReduceThunk::AuxData { }; /*static*/ bool NcclAllReduceThunk::CanImplement(const HloInstruction* crs) { + auto operands_are_supported = [crs]() { + return absl::c_all_of(crs->operands(), [](HloInstruction* operand) { + return LayoutUtil::IsDenseArray(operand->shape()) && + DatatypeToNccl(operand->shape().element_type()).has_value(); + }); + }; return MatchReductionComputation(crs->to_apply()).has_value() && - DatatypeToNccl(AllReducePrimitiveType(crs)).has_value() && - crs->IsCrossReplicaAllReduce() && - crs->operand_count() == 1 && // One array to reduce. - LayoutUtil::IsDenseArray(crs->operand(0)->shape()); + crs->IsCrossReplicaAllReduce() && operands_are_supported(); } /*static*/ absl::flat_hash_set @@ -471,16 +474,14 @@ NcclAllReduceThunk::DevicesWithOpenNcclChannels() { } NcclAllReduceThunk::NcclAllReduceThunk( - int64 replica_count, int64 element_count, - const BufferAllocation::Slice& source_buffer, - const BufferAllocation::Slice& destination_buffer, + int64 replica_count, std::vector buffers, const HloInstruction* all_reduce) : Thunk(Thunk::kNcclAllReduce, all_reduce), replica_count_(replica_count), - element_count_(element_count), - source_buffer_(source_buffer), - destination_buffer_(destination_buffer), - aux_data_(absl::make_unique()) {} + buffers_(std::move(buffers)), + aux_data_(absl::make_unique()) { + CHECK_EQ(hlo_instruction()->operand_count(), buffers_.size()); +} // Figures out which devices (named by their replica-ids) are participating in // the all-reduce subgroup that contains device_ordinal. 
@@ -506,18 +507,24 @@ Status NcclAllReduceThunk::ExecuteOnStream(const ExecuteParams& params) { << absl::StrJoin(participating_replicas, ", "); AllReduceParticipantData participant(rendezvous_key); - participant.element_count = element_count_; participant.device_ordinal = device_ordinal; - participant.source_data = - params.buffer_allocations->GetDeviceAddress(source_buffer_); - participant.destination_data = - params.buffer_allocations->GetDeviceAddress(destination_buffer_); + for (size_t i = 0; i < buffers_.size(); ++i) { + const NcclAllReduceThunk::Buffer& buffer = buffers_[i]; + AllReduceParticipantData::Buffer pbuffer; + pbuffer.element_count = buffer.element_count; + pbuffer.source_data = + params.buffer_allocations->GetDeviceAddress(buffer.source_buffer); + pbuffer.destination_data = + params.buffer_allocations->GetDeviceAddress(buffer.destination_buffer); + pbuffer.primitive_type = + hlo_instruction()->operand(i)->shape().element_type(); + participant.buffers.push_back(pbuffer); + } participant.stream = params.stream; auto reduction_kind = MatchReductionComputation(hlo_instruction()->to_apply()); CHECK(reduction_kind.has_value()); participant.reduction_kind = *reduction_kind; - participant.primitive_type = AllReducePrimitiveType(hlo_instruction()); TF_ASSIGN_OR_RETURN( std::shared_ptr clique, diff --git a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h index 36b757ae567..7633a99794f 100644 --- a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h @@ -50,9 +50,12 @@ class NcclAllReduceThunk : public Thunk { // TODO(b/125951860): Support all-reduces with replica groups, i.e. // all-reduces that compute multiple sums across subsets of all replicas. - NcclAllReduceThunk(int64 replica_count, int64 element_count, - const BufferAllocation::Slice& source_buffer, - const BufferAllocation::Slice& destination_buffer, + struct Buffer { + int64 element_count; + BufferAllocation::Slice source_buffer; + BufferAllocation::Slice destination_buffer; + }; + NcclAllReduceThunk(int64 replica_count, std::vector buffers, const HloInstruction* all_reduce); ~NcclAllReduceThunk() override; @@ -70,9 +73,7 @@ class NcclAllReduceThunk : public Thunk { struct AuxData; const int64 replica_count_; - const int64 element_count_; - const BufferAllocation::Slice source_buffer_; - const BufferAllocation::Slice destination_buffer_; + const std::vector buffers_; std::unique_ptr aux_data_; }; diff --git a/tensorflow/compiler/xla/tests/collective_ops_test.cc b/tensorflow/compiler/xla/tests/collective_ops_test.cc index 56c5f688312..5cdf9633ca4 100644 --- a/tensorflow/compiler/xla/tests/collective_ops_test.cc +++ b/tensorflow/compiler/xla/tests/collective_ops_test.cc @@ -368,6 +368,55 @@ XLA_TEST_F(CollectiveOpsTest, AllReduce_ManyConcurrentAllReduces) { done.Wait(); } +// Runs the same executable many times concurrently. The all-reduces should not +// conflict with one another. 
+XLA_TEST_F(CollectiveOpsTest, AllReduce_CombinableAllReduces) { + std::string hlo_string = R"( + HloModule test + + apply_op { + x = f32[] parameter(0) + y = f32[] parameter(1) + ROOT apply_op = f32[] add(x, y) + } + + ENTRY test_computation { + p0 = f32[5] parameter(0) + p1 = f32[5] parameter(1) + crs0 = f32[5] all-reduce(p0), replica_groups={}, to_apply=apply_op + crs1 = f32[5] all-reduce(p1), replica_groups={}, to_apply=apply_op + ROOT out = (f32[5], f32[5]) tuple(f32[5] crs0, f32[5] crs1) + } + )"; + static constexpr int kNumReplicas = 2; + auto config = GetModuleConfigForTest(); + config.set_replica_count(kNumReplicas); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string, config)); + + std::vector input0_vec = {1., 2., 3., 4., 5.}; + auto input0_literal = LiteralUtil::CreateR1(input0_vec); + std::vector input1_vec = {7., 3., 4., 1., 2.}; + auto input1_literal = LiteralUtil::CreateR1(input1_vec); + + TF_ASSERT_OK_AND_ASSIGN( + std::vector results, + ExecuteReplicated(std::move(module), {&input0_literal, &input1_literal}, + /*num_replicas=*/kNumReplicas, + /*use_threads=*/true)); + std::vector expected0_vec = {2., 4., 6., 8., 10.}; + auto expected0_literal = LiteralUtil::CreateR1(expected0_vec); + std::vector expected1_vec = {14., 6., 8., 2., 4.}; + auto expected1_literal = LiteralUtil::CreateR1(expected1_vec); + for (int replica_idx = 0; replica_idx < kNumReplicas; replica_idx++) { + auto rs = results[replica_idx].DecomposeTuple(); + EXPECT_TRUE(LiteralTestUtil::NearOrEqual(expected0_literal, rs[0], + ErrorSpec{1e-5, 1e-5})); + EXPECT_TRUE(LiteralTestUtil::NearOrEqual(expected1_literal, rs[1], + ErrorSpec{1e-5, 1e-5})); + } +} + // Runs an all-reduce with three partitions: // {0}, {1,2}, {3} // meaning, the all-reduce is a nop for devices 0 and 3, and only devices 1 and From 1f8af07856d6788899db4e8396ebc56c99271cda Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Wed, 19 Feb 2020 22:48:14 -0800 Subject: [PATCH 329/442] In verify ophint extraction: instead of error out, we should just don't do anything. 
PiperOrigin-RevId: 296135220 Change-Id: Id67e57859bc73ba13c4844e73691fcdff37894c1 --- tensorflow/compiler/mlir/lite/tests/extract-ophint.mlir | 9 +++++++-- .../compiler/mlir/lite/transforms/extract_ophint.cc | 5 ++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/tests/extract-ophint.mlir b/tensorflow/compiler/mlir/lite/tests/extract-ophint.mlir index bde800897c5..a18ba9cd91a 100644 --- a/tensorflow/compiler/mlir/lite/tests/extract-ophint.mlir +++ b/tensorflow/compiler/mlir/lite/tests/extract-ophint.mlir @@ -178,15 +178,20 @@ func @inputsAfterOutputs() { // ----- -// expected-error@+1 {{Found malformed ophint regions: missing inputs or outputs.}} module { -func @extractOphintFailure() { +func @extractOphintSame() { %0 = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x16x1xf32> %1 = call @AnotherFunc(%0) : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> %2 = "tf.Sigmoid"(%1) {T = "tfdtype$DT_FLOAT", name = "Sigmoid"} : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> %3 = "tf.Mul"(%2, %1) {T = "tfdtype$DT_FLOAT", name = "mul"} : (tensor<1x16x16x1xf32>, tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> %4 = "tf.Identity"(%3) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "d4b1eb00b81211e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation-d4b1eb00b81211e99426dc4a3e957995-0-None-None"} : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> return + +// CHECK: [[VAL_0:%.*]] = "tf.Placeholder"() {dtype = "tfdtype$DT_FLOAT", name = "Placeholder", shape = "tfshape$dim { size: 1 } dim { size: 16 } dim { size: 16 } dim { size: 1 }"} : () -> tensor<1x16x16x1xf32> +// CHECK: [[VAL_1:%.*]] = call @AnotherFunc([[VAL_0]]) : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> +// CHECK: [[VAL_2:%.*]] = "tf.Sigmoid"([[VAL_1]]) {T = "tfdtype$DT_FLOAT", name = "Sigmoid"} : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> +// CHECK: [[VAL_3:%.*]] = "tf.Mul"([[VAL_2]], [[VAL_1]]) {T = "tfdtype$DT_FLOAT", name = "mul"} : (tensor<1x16x16x1xf32>, tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> +// CHECK: [[VAL_4:%.*]] = "tf.Identity"([[VAL_3]]) {T = "tfdtype$DT_FLOAT", _tflite_function_name = "cool_activation", _tflite_function_output_index = 0 : i64, _tflite_function_uuid = "d4b1eb00b81211e99426dc4a3e957995", _tflite_ophint_level = 1 : i64, name = "OutputHint-cool_activation-d4b1eb00b81211e99426dc4a3e957995-0-None-None"} : (tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> } func @AnotherFunc(%arg0: tensor<1x16x16x1xf32>) -> tensor<1x16x16x1xf32> { diff --git a/tensorflow/compiler/mlir/lite/transforms/extract_ophint.cc b/tensorflow/compiler/mlir/lite/transforms/extract_ophint.cc index 7aab9f08732..e07cea8535e 100644 --- a/tensorflow/compiler/mlir/lite/transforms/extract_ophint.cc +++ b/tensorflow/compiler/mlir/lite/transforms/extract_ophint.cc @@ -698,11 +698,10 @@ void ExtractOphintPass::runOnModule() { if (ophint_composite_ops.empty()) continue; // Verify: Make sure all ophint_composite_ops are valid. + // If not valid, we just don't do anything. 
for (const auto& kv : ophint_composite_ops) { if (failed(kv.getValue().VerifyOphint())) { - module.emitError() - << "Found malformed ophint regions: missing inputs or outputs."; - return signalPassFailure(); + return; } } From acfe12fe6a317509bff143a8219863a9d65f2b78 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 22:49:55 -0800 Subject: [PATCH 330/442] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 296135428 Change-Id: Icbe9c3556c2c8bcdf0328e932ff73eeed95dd2da --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index ecdce1e627b..449a95765a5 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. 
-// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21345,7 +21345,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22053,7 +22053,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22249,7 +22249,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22318,7 +22318,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22433,7 +22433,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22492,7 +22492,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22666,7 +22666,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22857,7 +22857,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25297,7 +25297,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25629,7 +25629,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25679,7 +25679,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25929,7 +25929,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26559,7 +26559,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27624,7 +27624,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45536,7 +45536,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 05fe36c9bc89c41f3f5a7903cef430ed92b55d81 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 19 Feb 2020 23:12:07 -0800 Subject: [PATCH 331/442] Add pfor converter for "If" PiperOrigin-RevId: 296138673 Change-Id: Ib63ecece09d8d1df4e53c69dae0b511aaba6c120 --- .../ops/parallel_for/control_flow_ops_test.py | 21 +++++++++++++++++++ tensorflow/python/ops/parallel_for/pfor.py | 7 +++++++ 2 files changed, 28 insertions(+) diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py index 65cbdbe4503..2d8dfcfe696 100644 --- a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py +++ b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py @@ -52,6 +52,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn from tensorflow.python.ops import parsing_ops from tensorflow.python.ops import random_ops +from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import rnn from tensorflow.python.ops import rnn_cell from tensorflow.python.ops import stateless_random_ops @@ -1264,6 +1265,26 @@ class StatelessIfTest(PForTestCase): self._test_loop_fn(loop_fn, iters=5) +@test_util.run_all_in_graph_and_eager_modes +@test_util.with_control_flow_v2 +class IfTest(PForTestCase): + + def test_read_var(self): + x = [1, 2, 3, 4, 5.] + y = 2.5 + z = resource_variable_ops.ResourceVariable(5.) 
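+    # Reading the resource variable `z` inside both branches makes the branch
+    # functions stateful, so `cond_v2` below emits a stateful "If" op rather
+    # than "StatelessIf", exercising the new pfor converter for "If".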
+ + @def_function.function + def loop_fn(i): + x_i = array_ops.gather(x, i) + return cond_v2.cond_v2( + x_i < y, + lambda: z - x_i, + lambda: z + x_i) + + self._test_loop_fn(loop_fn, iters=5) + + class RNNTest(PForTestCase): @test_util.run_v1_only("b/122612051") diff --git a/tensorflow/python/ops/parallel_for/pfor.py b/tensorflow/python/ops/parallel_for/pfor.py index 88f31210ddb..556de0525bf 100644 --- a/tensorflow/python/ops/parallel_for/pfor.py +++ b/tensorflow/python/ops/parallel_for/pfor.py @@ -3665,6 +3665,7 @@ def _outputs_for_branch(func_name, indices, pfor_input, inputs): @RegisterPFor("StatelessIf") +@RegisterPFor("If") def _convert_stateless_if(pfor_input): cond, cond_stacked, _ = pfor_input.input(0) inputs = pfor_input.inputs[1:] @@ -3695,6 +3696,12 @@ def _convert_stateless_if(pfor_input): pfor_input, else_inputs) assert len(then_outputs) == len(else_outputs) + # Note that if the "then" and "else" branches are updating the same state, + # and possibly reading them as well, it could lead to undefined behavior + # since the ordering of those operations is not well defined. + # One possibility is to order all the "then" branches to execute before all + # the "else" branches so that the side-effects in the former are visible to + # the latter. For now, we leave that as undefined behavior. outputs = [] # Merge outputs for then_output, else_output in zip(then_outputs, else_outputs): From c4af1e338195759c8a6f72442cdf7ae9a8977210 Mon Sep 17 00:00:00 2001 From: Srinivas Vasudevan Date: Wed, 19 Feb 2020 23:14:01 -0800 Subject: [PATCH 332/442] Add test for Expm1 for small parameter regime of complex numbers. PiperOrigin-RevId: 296138822 Change-Id: Idf7ff8e34acb056bda59002f54b6d3df7c42ba5a --- tensorflow/compiler/tests/unary_ops_test.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py index c3ecc1c6215..a0aea950cde 100644 --- a/tensorflow/compiler/tests/unary_ops_test.py +++ b/tensorflow/compiler/tests/unary_ops_test.py @@ -587,6 +587,26 @@ class UnaryOpsTest(xla_test.XLATestCase): rtol=1e-6, atol=1e-6) + # For real part close to zero, or imaginary part close to a multiple of + # pi. + + self._assertOpOutputMatchesExpected( + math_ops.expm1, + np.array([[1e-11 + 1j, -1e-11 - 1j, 1. + 1e-11j, + -1. - 1e-11j, 1e-13j + 1e-13j]], dtype=dtype), + # TODO(srvasude): Use numpy as the source of truth after we depend on + # latest numpy with this pull request: + # https://github.com/numpy/numpy/pull/15110. + # The numbers below were generated by scipy.special.expm1. + expected=np.array([[ + -4.59697694e-01+8.41470985e-01j, + -4.59697694e-01-8.41470985e-01j, + 1.71828183e+00+2.71828183e-11j, + -6.32120559e-01-3.67879441e-12j, + -2.00000000e-26+2.00000000e-13j]], dtype=dtype), + rtol=1e-09, + atol=1e-20) + self._assertOpOutputMatchesExpected( math_ops.reciprocal, np.array([[1, 2j, 2 + 3j]], dtype=dtype), From d027ba19642ed498c03dd59e92f422c64fc6644e Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Wed, 19 Feb 2020 23:18:16 -0800 Subject: [PATCH 333/442] Use an AbstractOperationInterface in TFE_Op This allows us to move towards cleaning up some of the header dependencies in incurred by c_api_internal.h. 
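Roughly, the new indirection looks like this (a simplified sketch, not the
literal code in this change):

    // TFE_Op now owns an abstract operation instead of a concrete
    // tensorflow::EagerOperation.
    struct TFE_Op {
      std::unique_ptr<AbstractOperationInterface> operation;
    };

    // C API entry points delegate through the interface, e.g.:
    void TFE_OpSetAttrInt(TFE_Op* op, const char* attr_name, int64_t value) {
      auto s = op->operation->SetAttrInt(attr_name, value);
      if (!s.ok()) {
        LOG(WARNING) << "Unable to set attribute: " << attr_name;
      }
    }

The default implementation, tensorflow::OperationInterface, forwards each
call to EagerOperation, so eager execution behavior is unchanged.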
PiperOrigin-RevId: 296139196 Change-Id: I02be01d98ad06af8f1f2a8fc1f067849336a0c26 --- tensorflow/c/c_api_experimental.cc | 10 +- tensorflow/c/eager/BUILD | 6 + tensorflow/c/eager/c_api.cc | 267 ++++++--------- tensorflow/c/eager/c_api_experimental.cc | 11 +- tensorflow/c/eager/c_api_internal.h | 4 +- tensorflow/c/eager/c_api_test.cc | 56 +-- tensorflow/c/eager/operation_interface.cc | 319 ++++++++++++++++++ tensorflow/c/eager/operation_interface.h | 192 +++++++++++ .../core/common_runtime/eager/attr_builder.cc | 2 +- .../core/common_runtime/eager/attr_builder.h | 2 +- .../common_runtime/eager/eager_operation.h | 1 + .../common_runtime/eager/kernel_and_device.cc | 2 +- tensorflow/python/eager/pywrap_tfe_src.cc | 10 +- 13 files changed, 666 insertions(+), 216 deletions(-) create mode 100644 tensorflow/c/eager/operation_interface.cc create mode 100644 tensorflow/c/eager/operation_interface.h diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc index c11ef3756d5..4e7ba3943ae 100644 --- a/tensorflow/c/c_api_experimental.cc +++ b/tensorflow/c/c_api_experimental.cc @@ -31,6 +31,7 @@ limitations under the License. #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/casts.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/net.h" #include "tensorflow/core/platform/platform.h" @@ -816,12 +817,15 @@ void TFE_InferShapes(TFE_Op* tfe_op, TF_ShapeAndTypeList* input_shapes, const int num_inputs = input_shapes->num_items; NodeDef node_def; - node_def.set_name(tfe_op->operation.Name()); - node_def.set_op(tfe_op->operation.Name()); + node_def.set_name(tfe_op->operation->Name()); + node_def.set_op(tfe_op->operation->Name()); for (int i = 0; i < num_inputs; ++i) { node_def.add_input("dummy_input"); } - tfe_op->operation.Attrs().FillAttrValueMap(node_def.mutable_attr()); + tensorflow::down_cast( + tfe_op->operation.get()) + ->Attrs() + .FillAttrValueMap(node_def.mutable_attr()); const tensorflow::OpRegistrationData* op_reg_data; status->status = diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index 5901ddb6182..3a6c2eef1fe 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -28,6 +28,8 @@ tf_cuda_library( "c_api_debug.cc", "c_api_experimental.h", "c_api_internal.h", + "operation_interface.cc", + "operation_interface.h", "tensor_handle_interface.h", ], hdrs = ["c_api.h"], @@ -56,6 +58,7 @@ tf_cuda_library( "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core/platform:casts", "//tensorflow/core/platform:errors", "//tensorflow/core:protos_all_cc", "//tensorflow/core/profiler/lib:traceme", @@ -92,6 +95,7 @@ filegroup( srcs = [ "c_api_experimental.h", "c_api_internal.h", + "operation_interface.h", "tensor_handle_interface.h", ], visibility = [ @@ -104,6 +108,7 @@ tf_cuda_library( name = "c_api_internal", srcs = [ "c_api_experimental.h", + "operation_interface.h", "tensor_handle_interface.h", ], hdrs = ["c_api_internal.h"], @@ -128,6 +133,7 @@ tf_cuda_library( "//tensorflow/core/common_runtime/eager:eager_operation", "//tensorflow/core/common_runtime/eager:kernel_and_device", "//tensorflow/core/common_runtime/eager:tensor_handle", + "@com_google_absl//absl/container:fixed_array", ], ) diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index 4fa6ed64a2f..6e2b24502c7 100644 --- a/tensorflow/c/eager/c_api.cc +++ 
b/tensorflow/c/eager/c_api.cc @@ -27,7 +27,6 @@ limitations under the License. // clang-format on #include "absl/algorithm/container.h" -#include "absl/container/fixed_array.h" #include "absl/memory/memory.h" #include "tensorflow/c/c_api.h" #include "tensorflow/c/c_api_internal.h" @@ -95,14 +94,6 @@ using tensorflow::string; namespace { -const tensorflow::OpDef* GetOpDef(TFE_Op* op, TF_Status* status) { - const tensorflow::OpDef* op_def = op->operation.OpDef(); - if (op_def) return op_def; - status->status = - tensorflow::OpDefForOp(op->operation.Name().c_str(), &op_def); - return op_def; -} - bool IsCPU( absl::variant variant) { if (VariantDeviceIsCustom(variant)) { @@ -1253,9 +1244,8 @@ size_t TFE_TensorHandleDeviceMemorySize(TFE_TensorHandle* h, TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name, TF_Status* status) { std::unique_ptr new_op( - new TFE_Op{tensorflow::EagerOperation(ctx->context)}); - status->status = - new_op->operation.Reset(op_or_function_name, nullptr, false, nullptr); + new TFE_Op{std::make_unique(ctx)}); + status->status = new_op->operation->Reset(op_or_function_name, nullptr); if (!status->status.ok()) { new_op.reset(); } @@ -1265,51 +1255,51 @@ TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name, void TFE_DeleteOp(TFE_Op* op) { delete op; } void TFE_OpSetDevice(TFE_Op* op, const char* device_name, TF_Status* status) { - status->status = op->operation.SetDeviceName(device_name); + status->status = op->operation->SetDeviceName(device_name); } const char* TFE_OpGetDevice(TFE_Op* op, TF_Status* status) { - absl::variant variant_device = - (op->operation.Device() == tensorflow::kVariantDeviceNull) - ? op->operation.EagerContext().HostCPU() - : op->operation.Device(); - return absl::visit([](auto* device) { return device->name().c_str(); }, - variant_device); + return op->operation->DeviceName().c_str(); } void TFE_OpSetXLACompilation(TFE_Op* op, unsigned char enable) { - op->operation.SetUseXla(enable); -#ifndef TENSORFLOW_EAGER_USE_XLA +#ifdef TENSORFLOW_EAGER_USE_XLA + tensorflow::Status s = op->operation->SetUseXla(enable); + if (!s.ok()) { + LOG(ERROR) << "Could not enable XLA compilation for op: " << s; + } +#else LOG(WARNING) << "This call is a no-op, as the TensorFlow library is not " "built with XLA support."; #endif // TENSORFLOW_EAGER_USE_XLA } void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* input, TF_Status* status) { - tensorflow::TensorHandle* h = - tensorflow::down_cast( - input->handle.get()) - ->Handle(); - op->operation.AddInput(h); - status->status = op->operation.MaybeInferSingleInputAttrs(h); + status->status = op->operation->AddInput(input->handle); } void TFE_OpAddInputList(TFE_Op* op, TFE_TensorHandle** inputs, int num_inputs, TF_Status* status) { + absl::FixedArray> handles( + num_inputs); for (int i = 0; i < num_inputs; ++i) { - op->operation.AddInput( - tensorflow::down_cast( - inputs[i]->handle.get()) - ->Handle()); + handles[i].reset(inputs[i]->handle->Copy()); } - status->status = op->operation.InferInputListAttrs(num_inputs); + status->status = op->operation->AddInputList(handles); } TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name, unsigned char* is_list, TF_Status* status) { TF_AttrType ret = TF_ATTR_INT; - status->status = tensorflow::AttrTypeByName(*op->operation.AttrTypes(), - attr_name, &ret, is_list); + const tensorflow::AttrTypeMap* attr_types_; + bool is_function; + status->status = tensorflow::AttrTypeMapForOp(op->operation->Name().c_str(), + &attr_types_, &is_function); + if 
(!status->status.ok()) { + return ret; + } + status->status = + tensorflow::AttrTypeByName(*attr_types_, attr_name, &ret, is_list); return ret; } @@ -1330,221 +1320,150 @@ TF_AttrType TFE_OpNameGetAttrType(TFE_Context* ctx, void TFE_OpSetAttrString(TFE_Op* op, const char* attr_name, const void* value, size_t length) { - op->operation.MutableAttrs()->Set( - attr_name, - tensorflow::StringPiece(static_cast(value), length)); + auto s = op->operation->SetAttrString( + attr_name, static_cast(value), length); + if (!s.ok()) { + LOG(WARNING) << "Unable to set attribute: " << attr_name; + } } void TFE_OpSetAttrInt(TFE_Op* op, const char* attr_name, int64_t value) { - op->operation.MutableAttrs()->Set(attr_name, static_cast(value)); + auto s = op->operation->SetAttrInt(attr_name, value); + if (!s.ok()) { + LOG(WARNING) << "Unable to set attribute: " << attr_name; + } } void TFE_OpSetAttrFloat(TFE_Op* op, const char* attr_name, float value) { - op->operation.MutableAttrs()->Set(attr_name, value); + auto s = op->operation->SetAttrFloat(attr_name, value); + if (!s.ok()) { + LOG(WARNING) << "Unable to set attribute: " << attr_name; + } } void TFE_OpSetAttrBool(TFE_Op* op, const char* attr_name, unsigned char value) { - op->operation.MutableAttrs()->Set(attr_name, (value == 0) ? false : true); + auto s = op->operation->SetAttrBool(attr_name, (value == 0) ? false : true); + if (!s.ok()) { + LOG(WARNING) << "Unable to set attribute: " << attr_name; + } } void TFE_OpSetAttrType(TFE_Op* op, const char* attr_name, TF_DataType value) { - op->operation.MutableAttrs()->Set(attr_name, - static_cast(value)); + auto s = op->operation->SetAttrType(attr_name, value); + if (!s.ok()) { + LOG(WARNING) << "Unable to set attribute: " << attr_name; + } } void TFE_OpSetAttrShape(TFE_Op* op, const char* attr_name, const int64_t* dims, const int num_dims, TF_Status* out_status) { - if (num_dims > tensorflow::TensorShape::MaxDimensions()) { - TF_SetStatus(out_status, TF_INVALID_ARGUMENT, - tensorflow::strings::StrCat( - "Value specified for `", attr_name, "` has ", num_dims, - " dimensions which is over the limit of ", - tensorflow::TensorShape::MaxDimensions(), ".") - .c_str()); - return; - } - tensorflow::TensorShapeProto proto; - if (num_dims < 0) { - proto.set_unknown_rank(true); - } else { - for (int d = 0; d < num_dims; ++d) { - proto.add_dim()->set_size(dims[d]); - } - } - op->operation.MutableAttrs()->Set(attr_name, proto); + out_status->status = op->operation->SetAttrShape(attr_name, dims, num_dims); } void TFE_OpSetAttrFunction(TFE_Op* op, const char* attr_name, const TFE_Op* value) { - tensorflow::AttrValue attr_value; - tensorflow::NameAttrList* func = attr_value.mutable_func(); - func->set_name(value->operation.Name()); - value->operation.Attrs().FillAttrValueMap(func->mutable_attr()); - op->operation.MutableAttrs()->Set(attr_name, attr_value); + auto s = op->operation->SetAttrFunction(attr_name, value->operation); + if (!s.ok()) { + LOG(WARNING) << "Unable to set attribute: " << attr_name; + } } void TFE_OpSetAttrFunctionName(TFE_Op* op, const char* attr_name, const char* data, size_t length) { - tensorflow::AttrValue attr_value; - tensorflow::NameAttrList* func = attr_value.mutable_func(); - func->set_name(data, length); - op->operation.MutableAttrs()->Set(attr_name, attr_value); + auto s = op->operation->SetAttrFunctionName(attr_name, data, length); + if (!s.ok()) { + LOG(WARNING) << "Unable to set attribute: " << attr_name; + } } void TFE_OpSetAttrTensor(TFE_Op* op, const char* attr_name, TF_Tensor* tensor, 
TF_Status* status) { - tensorflow::Tensor t; - status->status = TF_TensorToTensor(tensor, &t); - if (status->status.ok()) op->operation.MutableAttrs()->Set(attr_name, t); + status->status = op->operation->SetAttrTensor(attr_name, tensor); } void TFE_OpSetAttrStringList(TFE_Op* op, const char* attr_name, const void* const* values, const size_t* lengths, int num_values) { - std::vector v(num_values); - for (int i = 0; i < num_values; ++i) { - v[i] = tensorflow::StringPiece(static_cast(values[i]), - lengths[i]); + auto s = + op->operation->SetAttrStringList(attr_name, values, lengths, num_values); + if (!s.ok()) { + LOG(WARNING) << "Unable to set attribute: " << attr_name; } - op->operation.MutableAttrs()->Set(attr_name, v); } void TFE_OpSetAttrFloatList(TFE_Op* op, const char* attr_name, const float* values, int num_values) { - op->operation.MutableAttrs()->Set( - attr_name, tensorflow::gtl::ArraySlice(values, num_values)); + auto s = op->operation->SetAttrFloatList(attr_name, values, num_values); + if (!s.ok()) { + LOG(WARNING) << "Unable to set attribute: " << attr_name; + } } void TFE_OpSetAttrIntList(TFE_Op* op, const char* attr_name, const int64_t* values, int num_values) { - op->operation.MutableAttrs()->Set( - attr_name, tensorflow::gtl::ArraySlice( - reinterpret_cast(values), num_values)); + auto s = op->operation->SetAttrIntList(attr_name, values, num_values); + if (!s.ok()) { + LOG(WARNING) << "Unable to set attribute: " << attr_name; + } } void TFE_OpSetAttrTypeList(TFE_Op* op, const char* attr_name, const TF_DataType* values, int num_values) { - op->operation.MutableAttrs()->Set( - attr_name, - tensorflow::gtl::ArraySlice( - reinterpret_cast(values), num_values)); + auto s = op->operation->SetAttrTypeList(attr_name, values, num_values); + if (!s.ok()) { + LOG(WARNING) << "Unable to set attribute: " << attr_name; + } } void TFE_OpSetAttrBoolList(TFE_Op* op, const char* attr_name, const unsigned char* values, int num_values) { - std::unique_ptr b(new bool[num_values]); - for (int i = 0; i < num_values; ++i) { - b[i] = values[i]; + auto s = op->operation->SetAttrBoolList(attr_name, values, num_values); + if (!s.ok()) { + LOG(WARNING) << "Unable to set attribute: " << attr_name; } - op->operation.MutableAttrs()->Set( - attr_name, tensorflow::gtl::ArraySlice(b.get(), num_values)); } void TFE_OpSetAttrShapeList(TFE_Op* op, const char* attr_name, const int64_t** dims, const int* num_dims, int num_values, TF_Status* out_status) { - std::unique_ptr proto( - new tensorflow::TensorShapeProto[num_values]); - for (int i = 0; i < num_values; ++i) { - const auto num_dims_i = num_dims[i]; - - if (num_dims_i > tensorflow::TensorShape::MaxDimensions()) { - TF_SetStatus(out_status, TF_INVALID_ARGUMENT, - tensorflow::strings::StrCat( - "Value specified for `", attr_name, "` has ", num_dims_i, - " dimensions which is over the limit of ", - tensorflow::TensorShape::MaxDimensions(), ".") - .c_str()); - return; - } - if (num_dims_i < 0) { - proto[i].set_unknown_rank(true); - } else { - const int64_t* dims_i = dims[i]; - auto proto_i = &proto[i]; - for (int d = 0; d < num_dims_i; ++d) { - proto_i->add_dim()->set_size(dims_i[d]); - } - } - } - op->operation.MutableAttrs()->Set( - attr_name, tensorflow::gtl::ArraySlice( - proto.get(), num_values)); + out_status->status = + op->operation->SetAttrShapeList(attr_name, dims, num_dims, num_values); } void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name, const TFE_Op** value, int num_values) { - std::unique_ptr funcs( - new 
tensorflow::NameAttrList[num_values]); - for (int i = 0; i < num_values; i++) { - funcs[i].set_name(value[i]->operation.Name()); - value[i]->operation.Attrs().FillAttrValueMap(funcs[i].mutable_attr()); + auto s = op->operation->SetAttrFunctionList(attr_name, value, num_values); + if (!s.ok()) { + LOG(WARNING) << "Unable to set attribute: " << attr_name; } - op->operation.MutableAttrs()->Set( - attr_name, tensorflow::gtl::ArraySlice( - funcs.get(), num_values)); } TF_CAPI_EXPORT extern int TFE_OpGetInputLength(TFE_Op* op, const char* input_name, TF_Status* status) { - const tensorflow::OpDef* op_def = GetOpDef(op, status); - if (!status->status.ok()) { - return -1; - } - tensorflow::AttrValueMap attrs; - op->operation.Attrs().FillAttrValueMap(&attrs); - tensorflow::NameRangeMap name_ranges; - status->status = tensorflow::NameRangesForNode( - tensorflow::AttrSlice(&attrs), *op_def, &name_ranges, nullptr); - if (!status->status.ok()) { - return -1; - } - auto iter = name_ranges.find(input_name); - if (iter == name_ranges.end()) { - status->status = tensorflow::errors::InvalidArgument("Input '", input_name, - "' not found"); - return -1; - } - return iter->second.second - iter->second.first; + int ret = -1; + status->status = op->operation->InputLength(input_name, &ret); + return ret; } TF_CAPI_EXPORT extern int TFE_OpGetOutputLength(TFE_Op* op, const char* output_name, TF_Status* status) { - const tensorflow::OpDef* op_def = GetOpDef(op, status); - if (!status->status.ok()) { - return -1; - } - tensorflow::AttrValueMap attrs; - op->operation.Attrs().FillAttrValueMap(&attrs); - tensorflow::NameRangeMap name_ranges; - status->status = tensorflow::NameRangesForNode( - tensorflow::AttrSlice(&attrs), *op_def, nullptr, &name_ranges); - if (!status->status.ok()) { - return -1; - } - auto iter = name_ranges.find(output_name); - if (iter == name_ranges.end()) { - status->status = tensorflow::errors::InvalidArgument( - "Output '", output_name, "' not found"); - return -1; - } - return iter->second.second - iter->second.first; + int ret = -1; + status->status = op->operation->OutputLength(output_name, &ret); + return ret; } void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals, TF_Status* status) { - absl::FixedArray handle_retvals(*num_retvals); - VLOG(1) << "Calling TFE_Execute() on op " << op; - status->status = tensorflow::EagerExecute(&op->operation, - handle_retvals.data(), num_retvals); + absl::FixedArray> handles( + *num_retvals); + status->status = op->operation->Execute(&handles, num_retvals); if (!status->status.ok()) { return; } for (int i = 0; i < *num_retvals; ++i) { - retvals[i] = new TFE_TensorHandle{ - std::make_unique(handle_retvals[i])}; + retvals[i] = new TFE_TensorHandle{std::move(handles[i])}; } } @@ -1673,13 +1592,17 @@ void TFE_ContextStartStep(TFE_Context* ctx) { ctx->context->StartStep(); } void TFE_ContextEndStep(TFE_Context* ctx) { ctx->context->EndStep(); } void TFE_OpGetAttrs(TFE_Op* op, TFE_OpAttrs* attrs) { - *attrs = TFE_OpAttrs(&op->operation.Attrs()); + auto operation = tensorflow::down_cast( + op->operation.get()); + *attrs = TFE_OpAttrs(&operation->Attrs()); } void TFE_OpAddAttrs(TFE_Op* op, const TFE_OpAttrs* attrs) { tensorflow::AttrValueMap m; attrs->attributes->FillAttrValueMap(&m); - tensorflow::AttrBuilder* destination = op->operation.MutableAttrs(); + auto operation = tensorflow::down_cast( + op->operation.get()); + tensorflow::AttrBuilder* destination = operation->MutableAttrs(); for (auto attribute : m) { destination->Set(attribute.first, 
attribute.second); } diff --git a/tensorflow/c/eager/c_api_experimental.cc b/tensorflow/c/eager/c_api_experimental.cc index 46f1f98b036..4ed9194c554 100644 --- a/tensorflow/c/eager/c_api_experimental.cc +++ b/tensorflow/c/eager/c_api_experimental.cc @@ -31,8 +31,8 @@ using tensorflow::string; void TFE_OpReset(TFE_Op* op_to_reset, const char* op_or_function_name, const char* raw_device_name, TF_Status* status) { if (op_to_reset) { - status->status = op_to_reset->operation.Reset( - op_or_function_name, raw_device_name, false, nullptr); + status->status = + op_to_reset->operation->Reset(op_or_function_name, raw_device_name); } else { TF_SetStatus(status, TF_INVALID_ARGUMENT, "op_to_reset should not be nullptr"); @@ -40,9 +40,7 @@ void TFE_OpReset(TFE_Op* op_to_reset, const char* op_or_function_name, } void TFE_OpConsumeInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) { - op->operation.ConsumeInput( - tensorflow::down_cast(h->handle.get()) - ->Handle()); + status->status = op->operation->ConsumeInput(h); } void TFE_ContextEnableGraphCollection(TFE_Context* ctx) { @@ -520,8 +518,7 @@ void TFE_DeleteCancellationManager( void TFE_OpSetCancellationManager(TFE_Op* op, TFE_CancellationManager* cancellation_manager, TF_Status* status) { - op->operation.SetCancellationManager( - &cancellation_manager->cancellation_manager); + status->status = op->operation->SetCancellationManager(cancellation_manager); } TFE_Executor* TFE_NewExecutor(bool is_async) { diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h index 01038a33549..943890b6259 100644 --- a/tensorflow/c/eager/c_api_internal.h +++ b/tensorflow/c/eager/c_api_internal.h @@ -27,12 +27,12 @@ limitations under the License. #include "tensorflow/c/c_api_internal.h" #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/operation_interface.h" #include "tensorflow/c/eager/tensor_handle_interface.h" #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/eager/attr_builder.h" #include "tensorflow/core/common_runtime/eager/context.h" #include "tensorflow/core/common_runtime/eager/eager_executor.h" -#include "tensorflow/core/common_runtime/eager/eager_operation.h" #include "tensorflow/core/common_runtime/eager/kernel_and_device.h" #include "tensorflow/core/common_runtime/eager/tensor_handle.h" #include "tensorflow/core/common_runtime/function.h" @@ -89,7 +89,7 @@ struct TFE_TensorDebugInfo { }; struct TFE_Op { - tensorflow::EagerOperation operation; + std::unique_ptr operation; }; struct TFE_MonitoringCounterCell { diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc index 91026a0650c..2bffe783097 100644 --- a/tensorflow/c/eager/c_api_test.cc +++ b/tensorflow/c/eager/c_api_test.cc @@ -415,8 +415,10 @@ void TensorHandleSilentCopy(bool async, ->Handle(); // The input handles should never change since they have been mirrored. 
- ASSERT_EQ(matmul->operation.Inputs()[0], arg0); - ASSERT_EQ(matmul->operation.Inputs()[1], arg1); + auto op = tensorflow::down_cast( + matmul->operation.get()); + ASSERT_EQ(op->GetInput(0), arg0); + ASSERT_EQ(op->GetInput(1), arg1); TFE_DeleteOp(matmul); TFE_DeleteTensorHandle(retvals[0]); @@ -1219,6 +1221,14 @@ TEST(CAPI, TestTFE_TensorHandleCopySharingUnderlyingTensorHandle) { TFE_DeleteTensorHandle(h_shares_tensor); } +tensorflow::AttrValueMap ExtractAttrs(TFE_Op* op) { + tensorflow::AttrValueMap attr_values; + tensorflow::down_cast(op->operation.get()) + ->Attrs() + .FillAttrValueMap(&attr_values); + return attr_values; +} + TEST(CAPI, TestTFE_OpInferSingleInputAttrs) { TF_Status* status = TF_NewStatus(); TFE_ContextOptions* opts = TFE_NewContextOptions(); @@ -1235,8 +1245,7 @@ TEST(CAPI, TestTFE_OpInferSingleInputAttrs) { TFE_OpAddInput(minOp, axis, status); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - tensorflow::AttrValueMap attr_values; - minOp->operation.Attrs().FillAttrValueMap(&attr_values); + tensorflow::AttrValueMap attr_values = ExtractAttrs(minOp); tensorflow::AttrValueMap::const_iterator attr_found = attr_values.find("T"); EXPECT_NE(attr_found, attr_values.cend()); EXPECT_EQ(attr_found->second.type(), tensorflow::DataType::DT_FLOAT); @@ -1275,8 +1284,7 @@ TEST(CAPI, TestTFE_OpInferSingleTypeInputListAttrs) { TFE_OpAddInputList(concatOp, inputs, 2, status); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - tensorflow::AttrValueMap attr_values; - concatOp->operation.Attrs().FillAttrValueMap(&attr_values); + tensorflow::AttrValueMap attr_values = ExtractAttrs(concatOp); tensorflow::AttrValueMap::const_iterator attr_found = attr_values.find("T"); EXPECT_NE(attr_found, attr_values.cend()); EXPECT_EQ(attr_found->second.type(), tensorflow::DataType::DT_FLOAT); @@ -1316,8 +1324,7 @@ TEST(CAPI, TestTFE_OpInferMixedTypeInputListAttrs) { TFE_OpAddInputList(assertOp, data, 3, status); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - tensorflow::AttrValueMap attr_values; - assertOp->operation.Attrs().FillAttrValueMap(&attr_values); + tensorflow::AttrValueMap attr_values = ExtractAttrs(assertOp); tensorflow::AttrValueMap::const_iterator attr_found = attr_values.find("T"); EXPECT_NE(attr_found, attr_values.cend()); EXPECT_EQ(attr_found->second.list().type(0), tensorflow::DataType::DT_BOOL); @@ -1353,16 +1360,15 @@ TEST(CAPI, TestTFE_OpAttrsInferenceDisabledWhenNotCallingOpAddInputList) { TFE_TensorHandle* inputs[] = {input1, input2}; TFE_OpAddInput(concatOp, dim, status); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - CHECK(concatOp->operation.OpDef()); + CHECK(concatOp->operation->OpDef()); TFE_OpAddInput(concatOp, inputs[0], status); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - EXPECT_FALSE(concatOp->operation.OpDef()) + EXPECT_FALSE(concatOp->operation->OpDef()) << "Inference context is still present"; TFE_OpAddInput(concatOp, inputs[1], status); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - tensorflow::AttrValueMap attr_values; - concatOp->operation.Attrs().FillAttrValueMap(&attr_values); + tensorflow::AttrValueMap attr_values = ExtractAttrs(concatOp); EXPECT_EQ(attr_values.find("T"), attr_values.end()); EXPECT_EQ(attr_values.find("N"), attr_values.end()); @@ -1456,30 +1462,32 @@ TEST(CAPI, TestTFE_OpGetAttrs) { CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_DeleteContextOptions(opts); - TFE_Op* varop = TFE_NewOp(ctx, "VarHandleOp", status); - TFE_OpSetAttrType(varop, "dtype", 
TF_INT64); - TFE_OpSetAttrShape(varop, "shape", {}, 0, status); + TFE_Op* var_op = TFE_NewOp(ctx, "VarHandleOp", status); + TFE_OpSetAttrType(var_op, "dtype", TF_INT64); + TFE_OpSetAttrShape(var_op, "shape", {}, 0, status); TFE_OpAttrs attributes; - TFE_OpGetAttrs(varop, &attributes); + TFE_OpGetAttrs(var_op, &attributes); - TFE_Op* varop_copy = TFE_NewOp(ctx, "VarHandleOp", status); - TFE_OpSetAttrType(varop_copy, "dtype", TF_FLOAT); - TFE_OpAddAttrs(varop_copy, &attributes); + TFE_Op* copy_op = TFE_NewOp(ctx, "VarHandleOp", status); + TFE_OpSetAttrType(copy_op, "dtype", TF_FLOAT); + TFE_OpAddAttrs(copy_op, &attributes); unsigned char is_list = 0; ASSERT_EQ(TF_ATTR_TYPE, - TFE_OpGetAttrType(varop_copy, "dtype", &is_list, status)); + TFE_OpGetAttrType(copy_op, "dtype", &is_list, status)); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); ASSERT_EQ(TF_ATTR_SHAPE, - TFE_OpGetAttrType(varop_copy, "shape", &is_list, status)); + TFE_OpGetAttrType(copy_op, "shape", &is_list, status)); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); tensorflow::AttrValueMap attr_values; - varop_copy->operation.Attrs().FillAttrValueMap(&attr_values); + auto op = tensorflow::down_cast( + copy_op->operation.get()); + op->Attrs().FillAttrValueMap(&attr_values); EXPECT_EQ(tensorflow::DT_FLOAT, attr_values.find("dtype")->second.type()); TF_DeleteStatus(status); - TFE_DeleteOp(varop); - TFE_DeleteOp(varop_copy); + TFE_DeleteOp(var_op); + TFE_DeleteOp(copy_op); TFE_DeleteContext(ctx); } diff --git a/tensorflow/c/eager/operation_interface.cc b/tensorflow/c/eager/operation_interface.cc new file mode 100644 index 00000000000..ce62590fd51 --- /dev/null +++ b/tensorflow/c/eager/operation_interface.cc @@ -0,0 +1,319 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/eager/operation_interface.h" + +#include "absl/container/fixed_array.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_internal.h" +#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/core/common_runtime/eager/eager_operation.h" +#include "tensorflow/core/common_runtime/eager/execute.h" +#include "tensorflow/core/platform/casts.h" +#include "tensorflow/core/platform/errors.h" + +namespace tensorflow { + +OperationInterface::OperationInterface(TFE_Context* ctx) + : operation_(ctx->context) {} + +const string& OperationInterface::DeviceName() const { + absl::variant variant_device = + (operation_.Device() == kVariantDeviceNull) + ? 
operation_.EagerContext().HostCPU() + : operation_.Device(); + return absl::visit([](auto* d) -> const string& { return d->name(); }, + variant_device); +} + +Status OperationInterface::SetDeviceName(const char* name) { + return operation_.SetDeviceName(name); +} + +Status OperationInterface::SetAttrString(const char* attr_name, + const char* data, size_t length) { + operation_.MutableAttrs()->Set(attr_name, StringPiece(data, length)); + return Status::OK(); +} + +Status OperationInterface::SetAttrInt(const char* attr_name, int64_t value) { + operation_.MutableAttrs()->Set(attr_name, static_cast(value)); + return Status::OK(); +} + +Status OperationInterface::SetAttrFloat(const char* attr_name, float value) { + operation_.MutableAttrs()->Set(attr_name, value); + return Status::OK(); +} + +Status OperationInterface::SetAttrBool(const char* attr_name, bool value) { + operation_.MutableAttrs()->Set(attr_name, value); + return Status::OK(); +} + +Status OperationInterface::SetAttrType(const char* attr_name, + TF_DataType value) { + operation_.MutableAttrs()->Set(attr_name, static_cast(value)); + return Status::OK(); +} + +Status OperationInterface::SetAttrShape(const char* attr_name, + const int64_t* dims, + const int num_dims) { + if (num_dims > TensorShape::MaxDimensions()) { + return errors::InvalidArgument("Value specified for `", attr_name, "` has ", + num_dims, + " dimensions which is over the limit of ", + TensorShape::MaxDimensions(), "."); + } + + TensorShapeProto proto; + if (num_dims < 0) { + proto.set_unknown_rank(true); + } else { + for (int d = 0; d < num_dims; ++d) { + proto.add_dim()->set_size(dims[d]); + } + } + + operation_.MutableAttrs()->Set(attr_name, proto); + + return Status::OK(); +} + +Status OperationInterface::SetAttrFunction( + const char* attr_name, + const std::unique_ptr& value) { + AttrValue attr_value; + NameAttrList* func = attr_value.mutable_func(); + func->set_name(value->Name()); + OperationInterface* value_operation = + tensorflow::down_cast(value.get()); + value_operation->operation_.Attrs().FillAttrValueMap(func->mutable_attr()); + operation_.MutableAttrs()->Set(attr_name, attr_value); + return Status::OK(); +} + +Status OperationInterface::SetAttrFunctionName(const char* attr_name, + const char* data, + size_t length) { + AttrValue attr_value; + NameAttrList* func = attr_value.mutable_func(); + func->set_name(data, length); + operation_.MutableAttrs()->Set(attr_name, attr_value); + return Status::OK(); +} + +Status OperationInterface::SetAttrTensor(const char* attr_name, + TF_Tensor* tensor) { + Tensor t; + TF_RETURN_IF_ERROR(TF_TensorToTensor(tensor, &t)); + operation_.MutableAttrs()->Set(attr_name, t); + return Status::OK(); +} + +Status OperationInterface::SetAttrStringList(const char* attr_name, + const void* const* values, + const size_t* lengths, + int num_values) { + std::vector v(num_values); + for (int i = 0; i < num_values; ++i) { + v[i] = StringPiece(static_cast(values[i]), lengths[i]); + } + operation_.MutableAttrs()->Set(attr_name, v); + + return Status::OK(); +} + +Status OperationInterface::SetAttrFloatList(const char* attr_name, + const float* values, + int num_values) { + operation_.MutableAttrs()->Set( + attr_name, gtl::ArraySlice(values, num_values)); + return Status::OK(); +} + +Status OperationInterface::SetAttrIntList(const char* attr_name, + const int64_t* values, + int num_values) { + operation_.MutableAttrs()->Set( + attr_name, gtl::ArraySlice( + reinterpret_cast(values), num_values)); + return Status::OK(); +} + +Status 
OperationInterface::SetAttrTypeList(const char* attr_name, + const TF_DataType* values, + int num_values) { + operation_.MutableAttrs()->Set( + attr_name, gtl::ArraySlice( + reinterpret_cast(values), num_values)); + return Status::OK(); +} + +Status OperationInterface::SetAttrBoolList(const char* attr_name, + const unsigned char* values, + int num_values) { + std::unique_ptr b(new bool[num_values]); + for (int i = 0; i < num_values; ++i) { + b[i] = values[i]; + } + operation_.MutableAttrs()->Set( + attr_name, gtl::ArraySlice(b.get(), num_values)); + return Status::OK(); +} + +Status OperationInterface::SetAttrShapeList(const char* attr_name, + const int64_t** dims, + const int* num_dims, + int num_values) { + std::unique_ptr proto(new TensorShapeProto[num_values]); + for (int i = 0; i < num_values; ++i) { + const auto num_dims_i = num_dims[i]; + + if (num_dims_i > TensorShape::MaxDimensions()) { + return errors::InvalidArgument( + strings::StrCat("Value specified for `", attr_name, "` has ", + num_dims_i, " dimensions which is over the limit of ", + TensorShape::MaxDimensions(), ".")); + } + if (num_dims_i < 0) { + proto[i].set_unknown_rank(true); + } else { + const int64_t* dims_i = dims[i]; + auto proto_i = &proto[i]; + for (int d = 0; d < num_dims_i; ++d) { + proto_i->add_dim()->set_size(dims_i[d]); + } + } + } + operation_.MutableAttrs()->Set( + attr_name, gtl::ArraySlice(proto.get(), num_values)); + return Status::OK(); +} + +Status OperationInterface::SetAttrFunctionList(const char* attr_name, + const TFE_Op** value, + int num_values) { + std::unique_ptr funcs(new NameAttrList[num_values]); + for (int i = 0; i < num_values; i++) { + auto value_operation = + tensorflow::down_cast(value[i]->operation.get()); + funcs[i].set_name(value_operation->operation_.Name()); + value_operation->operation_.Attrs().FillAttrValueMap( + funcs[i].mutable_attr()); + } + operation_.MutableAttrs()->Set( + attr_name, gtl::ArraySlice(funcs.get(), num_values)); + return Status::OK(); +} + +const OpDef* OperationInterface::GetOpDef(Status* status) { + const tensorflow::OpDef* op_def = operation_.OpDef(); + if (op_def) return op_def; + *status = OpDefForOp(Name(), &op_def); + return op_def; +} + +Status OperationInterface::InputLength(const char* input_name, int* length) { + Status status; + const tensorflow::OpDef* op_def = GetOpDef(&status); + if (!status.ok()) { + return status; + } + AttrValueMap attrs; + operation_.Attrs().FillAttrValueMap(&attrs); + NameRangeMap name_ranges; + TF_RETURN_IF_ERROR( + NameRangesForNode(AttrSlice(&attrs), *op_def, &name_ranges, nullptr)); + auto iter = name_ranges.find(input_name); + if (iter == name_ranges.end()) { + return errors::InvalidArgument("Input '", input_name, "' not found"); + } + *length = iter->second.second - iter->second.first; + return Status::OK(); +} + +Status OperationInterface::OutputLength(const char* output_name, int* length) { + Status status; + const tensorflow::OpDef* op_def = GetOpDef(&status); + if (!status.ok()) { + return status; + } + AttrValueMap attrs; + operation_.Attrs().FillAttrValueMap(&attrs); + NameRangeMap name_ranges; + TF_RETURN_IF_ERROR( + NameRangesForNode(AttrSlice(&attrs), *op_def, nullptr, &name_ranges)); + auto iter = name_ranges.find(output_name); + if (iter == name_ranges.end()) { + return errors::InvalidArgument("Output '", output_name, "' not found"); + } + *length = iter->second.second - iter->second.first; + return Status::OK(); +} + +Status OperationInterface::AddInput( + const std::unique_ptr& input) { + TensorHandle* h 
= + tensorflow::down_cast(input.get())->Handle(); + operation_.AddInput(h); + return operation_.MaybeInferSingleInputAttrs(h); +} + +Status OperationInterface::AddInputList( + const absl::FixedArray>& + inputs) { + for (auto& input : inputs) { + TensorHandle* h = + tensorflow::down_cast(input.get())->Handle(); + operation_.AddInput(h); + } + return operation_.InferInputListAttrs(inputs.size()); +} + +Status OperationInterface::Execute( + absl::FixedArray>* retvals, + int* num_retvals) { + absl::FixedArray handle_retvals(*num_retvals); + TF_RETURN_IF_ERROR( + EagerExecute(&operation_, handle_retvals.data(), num_retvals)); + for (int i = 0; i < *num_retvals; ++i) { + retvals->at(i).reset( + new tensorflow::TensorHandleInterface(handle_retvals[i])); + } + return Status::OK(); +} + +Status OperationInterface::SetCancellationManager( + TFE_CancellationManager* cancellation_manager) { + operation_.SetCancellationManager( + &cancellation_manager->cancellation_manager); + return Status::OK(); +} + +Status OperationInterface::SetUseXla(bool enable) { + operation_.SetUseXla(enable); + return Status::OK(); +} + +Status OperationInterface::ConsumeInput(TFE_TensorHandle* h) { + auto handle = + tensorflow::down_cast(h->handle.get())->Handle(); + operation_.ConsumeInput(handle); + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/c/eager/operation_interface.h b/tensorflow/c/eager/operation_interface.h new file mode 100644 index 00000000000..189d4b4e333 --- /dev/null +++ b/tensorflow/c/eager/operation_interface.h @@ -0,0 +1,192 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_OPERATION_INTERFACE_H_ +#define TENSORFLOW_C_EAGER_OPERATION_INTERFACE_H_ + +#include + +#include "absl/container/fixed_array.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/core/common_runtime/eager/eager_operation.h" + +// Abstract interface to an operation. 
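+// A TFE_Op owns exactly one of these, and the C API manipulates the op only
+// through this interface; the default implementation (OperationInterface
+// below) forwards each call to tensorflow::EagerOperation.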
+class AbstractOperationInterface { + public: + virtual ~AbstractOperationInterface() {} + + virtual void Clear() = 0; + virtual tensorflow::Status Reset(const char* op, + const char* raw_device_name) = 0; + + virtual const tensorflow::string& Name() const = 0; + virtual const tensorflow::string& DeviceName() const = 0; + virtual tensorflow::Status SetDeviceName(const char* name) = 0; + + virtual tensorflow::Status AddInput( + const std::unique_ptr& input) = 0; + virtual tensorflow::Status AddInputList( + const absl::FixedArray>& + inputs) = 0; + virtual tensorflow::Status Execute( + absl::FixedArray>* retvals, + int* num_retvals) = 0; + virtual const tensorflow::OpDef* OpDef() const = 0; + + virtual tensorflow::Status SetAttrString(const char* attr_name, + const char* data, size_t length) = 0; + virtual tensorflow::Status SetAttrInt(const char* attr_name, + int64_t value) = 0; + virtual tensorflow::Status SetAttrFloat(const char* attr_name, + float value) = 0; + virtual tensorflow::Status SetAttrBool(const char* attr_name, bool value) = 0; + virtual tensorflow::Status SetAttrType(const char* attr_name, + TF_DataType value) = 0; + virtual tensorflow::Status SetAttrShape(const char* attr_name, + const int64_t* dims, + const int num_dims) = 0; + virtual tensorflow::Status SetAttrFunction( + const char* attr_name, + const std::unique_ptr& value) = 0; + virtual tensorflow::Status SetAttrFunctionName(const char* attr_name, + const char* value, + size_t length) = 0; + virtual tensorflow::Status SetAttrTensor(const char* attr_name, + TF_Tensor* tensor) = 0; + virtual tensorflow::Status SetAttrStringList(const char* attr_name, + const void* const* values, + const size_t* lengths, + int num_values) = 0; + virtual tensorflow::Status SetAttrFloatList(const char* attr_name, + const float* values, + int num_values) = 0; + virtual tensorflow::Status SetAttrIntList(const char* attr_name, + const int64_t* values, + int num_values) = 0; + virtual tensorflow::Status SetAttrTypeList(const char* attr_name, + const TF_DataType* values, + int num_values) = 0; + virtual tensorflow::Status SetAttrBoolList(const char* attr_name, + const unsigned char* values, + int num_values) = 0; + virtual tensorflow::Status SetAttrShapeList(const char* attr_name, + const int64_t** dims, + const int* num_dims, + int num_values) = 0; + virtual tensorflow::Status SetAttrFunctionList(const char* attr_name, + const TFE_Op** value, + int num_values) = 0; + + virtual tensorflow::Status InputLength(const char* input_name, + int* length) = 0; + virtual tensorflow::Status OutputLength(const char* output_name, + int* length) = 0; + + // Experimental + virtual tensorflow::Status SetUseXla(bool enable) { + return tensorflow::errors::Unimplemented("SetUseXla not implemented"); + } + virtual tensorflow::Status ConsumeInput(TFE_TensorHandle* h) { + return tensorflow::errors::Unimplemented("ConsumeInput not implemented"); + } + virtual tensorflow::Status SetCancellationManager( + TFE_CancellationManager* cancellation_manager) { + return tensorflow::errors::Unimplemented( + "SetCancellationManager not implemented"); + } +}; + +namespace tensorflow { + +class OpDef; + +class OperationInterface : public AbstractOperationInterface { + public: + explicit OperationInterface(TFE_Context* ctx); + ~OperationInterface() override{}; + + void Clear() override { operation_.Clear(); } + Status Reset(const char* op, const char* raw_device_name) override { + return operation_.Reset(op, raw_device_name, false, nullptr); + } + + const string& Name() const 
override { return operation_.Name(); } + const string& DeviceName() const override; + Status SetDeviceName(const char* name) override; + + Status AddInput( + const std::unique_ptr& input) override; + Status AddInputList( + const absl::FixedArray>& + inputs) override; + Status Execute( + absl::FixedArray>* retvals, + int* num_retvals) override; + const tensorflow::OpDef* OpDef() const override { + return operation_.OpDef(); + }; + + Status SetAttrString(const char* attr_name, const char* data, + size_t length) override; + Status SetAttrInt(const char* attr_name, int64_t value) override; + Status SetAttrFloat(const char* attr_name, float value) override; + Status SetAttrBool(const char* attr_name, bool value) override; + Status SetAttrType(const char* attr_name, TF_DataType value) override; + Status SetAttrShape(const char* attr_name, const int64_t* dims, + const int num_dims) override; + Status SetAttrFunction( + const char* attr_name, + const std::unique_ptr& value) override; + Status SetAttrFunctionName(const char* attr_name, const char* data, + size_t length) override; + Status SetAttrTensor(const char* attr_name, TF_Tensor* tensor) override; + Status SetAttrStringList(const char* attr_name, const void* const* values, + const size_t* lengths, int num_values) override; + Status SetAttrFloatList(const char* attr_name, const float* values, + int num_values) override; + Status SetAttrIntList(const char* attr_name, const int64_t* values, + int num_values) override; + Status SetAttrTypeList(const char* attr_name, const TF_DataType* values, + int num_values) override; + Status SetAttrBoolList(const char* attr_name, const unsigned char* values, + int num_values) override; + Status SetAttrShapeList(const char* attr_name, const int64_t** dims, + const int* num_dims, int num_values) override; + Status SetAttrFunctionList(const char* attr_name, const TFE_Op** value, + int num_values) override; + + Status InputLength(const char* input_name, int* length) override; + Status OutputLength(const char* output_name, int* length) override; + + Status SetUseXla(bool enable) override; + Status ConsumeInput(TFE_TensorHandle* h) override; + Status SetCancellationManager( + TFE_CancellationManager* cancellation_manager) override; + + // TODO(gjn): Remove once TFE_InferShapes is removed + const tensorflow::AttrBuilder& Attrs() const { return operation_.Attrs(); } + tensorflow::AttrBuilder* MutableAttrs() { return operation_.MutableAttrs(); } + + const TensorHandle* GetInput(int i) const { return operation_.Inputs()[i]; } + + private: + const tensorflow::OpDef* GetOpDef(Status* status); + EagerOperation operation_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_OPERATION_INTERFACE_H_ diff --git a/tensorflow/core/common_runtime/eager/attr_builder.cc b/tensorflow/core/common_runtime/eager/attr_builder.cc index 66d9063910e..69365e34ca0 100644 --- a/tensorflow/core/common_runtime/eager/attr_builder.cc +++ b/tensorflow/core/common_runtime/eager/attr_builder.cc @@ -54,7 +54,7 @@ const AttrTypeMap* GetDefaultFunctionAttrTypeMap() { } // namespace -Status OpDefForOp(const char* op_name, const OpDef** op_def) { +Status OpDefForOp(const string& op_name, const OpDef** op_def) { const OpRegistrationData* op_reg_data = nullptr; Status s = OpRegistry::Global()->LookUp(op_name, &op_reg_data); if (s.ok()) { diff --git a/tensorflow/core/common_runtime/eager/attr_builder.h b/tensorflow/core/common_runtime/eager/attr_builder.h index 65a52efb740..1a871b01a4d 100644 --- 
a/tensorflow/core/common_runtime/eager/attr_builder.h +++ b/tensorflow/core/common_runtime/eager/attr_builder.h @@ -42,7 +42,7 @@ namespace tensorflow { typedef std::unordered_map AttrTypeMap; // Look up OpDef for `op_name`. -Status OpDefForOp(const char* op_name, const OpDef** op_def); +Status OpDefForOp(const string& op_name, const OpDef** op_def); // Returns the AttrTypeMap for the TensorFlow operation named op_name. // If op_name is not registered in global op registry, AttrTypeMapForOp assumes diff --git a/tensorflow/core/common_runtime/eager/eager_operation.h b/tensorflow/core/common_runtime/eager/eager_operation.h index cfde6f0e09d..0261818ac96 100644 --- a/tensorflow/core/common_runtime/eager/eager_operation.h +++ b/tensorflow/core/common_runtime/eager/eager_operation.h @@ -55,6 +55,7 @@ class EagerOperation { bool is_function() const { return is_function_; } tensorflow::EagerContext& EagerContext() { return ctx_; } + const tensorflow::EagerContext& EagerContext() const { return ctx_; } AttrBuilder* MutableAttrs() { return &attrs_; } const AttrBuilder& Attrs() const { return attrs_; } diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc index 8ca02ca51c0..d0d961a0055 100644 --- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc +++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc @@ -136,7 +136,7 @@ Status KernelAndDeviceFunc::InstantiateFunc(const NodeDef& ndef, if (function_def != nullptr) { op_def = &(function_def->signature()); } else { - TF_RETURN_IF_ERROR(OpDefForOp(ndef.op().c_str(), &op_def)); + TF_RETURN_IF_ERROR(OpDefForOp(ndef.op(), &op_def)); } TF_RETURN_IF_ERROR( InOutTypesForNode(ndef, *op_def, &input_dtypes_, &output_dtypes_)); diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc index 39ea862ba5e..f64b05aa599 100644 --- a/tensorflow/python/eager/pywrap_tfe_src.cc +++ b/tensorflow/python/eager/pywrap_tfe_src.cc @@ -32,6 +32,7 @@ limitations under the License. 
#include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/platform/casts.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/types.h" @@ -74,10 +75,9 @@ TFE_Op* GetOp(TFE_Context* ctx, const char* op_or_function_name, const char* raw_device_name, TF_Status* status) { std::unique_ptr op = ReleaseThreadLocalOp(ctx); if (!op) { - op.reset(new TFE_Op{tensorflow::EagerOperation(ctx->context)}); + op.reset(new TFE_Op{std::make_unique(ctx)}); } - status->status = - op->operation.Reset(op_or_function_name, raw_device_name, false, nullptr); + status->status = op->operation->Reset(op_or_function_name, raw_device_name); if (!status->status.ok()) { op.reset(); } @@ -86,7 +86,7 @@ TFE_Op* GetOp(TFE_Context* ctx, const char* op_or_function_name, void ReturnOp(TFE_Context* ctx, TFE_Op* op) { if (op) { - op->operation.Clear(); + op->operation->Clear(); thread_local_eager_operation_map[ctx].reset(op); } } @@ -3393,7 +3393,7 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject* args) { return nullptr; } - const tensorflow::OpDef* op_def = op->operation.OpDef(); + const tensorflow::OpDef* op_def = op->operation->OpDef(); if (op_def == nullptr) return nullptr; if (args_size < kFastPathExecuteInputStartIndex + op_def->input_arg_size()) { From f8b2a05ee9cbf5210ef14b768cbdf39bacac04d7 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Wed, 19 Feb 2020 23:26:34 -0800 Subject: [PATCH 334/442] Add function to load saved model for tflite mlir converter. PiperOrigin-RevId: 296139934 Change-Id: I1c608c2971d81e5efa38925ee9fe4b80f437726a --- tensorflow/compiler/mlir/lite/BUILD | 1 + .../compiler/mlir/lite/tf_tfl_translate.cc | 25 ++++++++++--- .../compiler/mlir/lite/tf_tfl_translate_cl.cc | 27 ++++++++++++++ .../compiler/mlir/lite/tf_tfl_translate_cl.h | 6 +++ .../mlir/lite/tf_to_tfl_flatbuffer.cc | 37 +++++++++++++++++++ .../compiler/mlir/lite/tf_to_tfl_flatbuffer.h | 6 +++ 6 files changed, 96 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index 1ab9b70555d..8d51dd3cfc2 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -651,6 +651,7 @@ tf_cc_binary( "//tensorflow/compiler/mlir:init_mlir", "//tensorflow/compiler/mlir/tensorflow:translate_cl_options", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:errors", "//tensorflow/lite:framework", "//tensorflow/lite/schema:schema_fbs", "//tensorflow/stream_executor/lib", diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc index 914156deaae..7f8ce4cf3d4 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc @@ -36,6 +36,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.h" #include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/lite/model.h" #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/stream_executor/lib/statusor.h" @@ -132,12 +133,24 @@ int main(int argc, char **argv) { llvm::SourceMgr source_mgr; mlir::SourceMgrDiagnosticHandler sourceMgrHandler(source_mgr, &context); - StatusOr module = - tensorflow::LoadFromGraphdefOrMlirSource( - input_file_name, input_mlir, use_splatted_constant, custom_opdefs, - debug_info_file, input_arrays, input_dtypes, input_shapes, - output_arrays, - /*prune_unused_nodes=*/true, &source_mgr, &context); + StatusOr module; + + // TODO(b/147435528): We need to test the e2e behavior once the graph freezing + // inside mlir is done. + if (import_saved_model || import_saved_model_v1) { + if (input_mlir) + module = tensorflow::errors::InvalidArgument( + "Importing saved model should not have input_mlir set"); + module = tensorflow::ImportSavedModel( + import_saved_model, import_saved_model_v1, input_file_name, + saved_model_tags, saved_model_exported_names, &context); + } else { + module = tensorflow::LoadFromGraphdefOrMlirSource( + input_file_name, input_mlir, use_splatted_constant, custom_opdefs, + debug_info_file, input_arrays, input_dtypes, input_shapes, + output_arrays, + /*prune_unused_nodes=*/true, &source_mgr, &context); + } // If errors occur, the library call in the above already logged the error // message. So we can just return here. diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.cc b/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.cc index 3ec0769db30..de569a3496c 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.cc @@ -22,6 +22,33 @@ using llvm::cl::opt; opt input_file_name(llvm::cl::Positional, llvm::cl::desc(""), llvm::cl::init("-")); + +// NOLINTNEXTLINE +opt import_saved_model( + "savedmodel-to-mlir", + llvm::cl::desc("Import a saved model to its MLIR representation"), + llvm::cl::value_desc("dir")); + +// NOLINTNEXTLINE +opt import_saved_model_v1( + "savedmodel-v1-to-mlir", + llvm::cl::desc("Import a saved model V1 to its MLIR representation"), + llvm::cl::value_desc("dir")); + +// NOLINTNEXTLINE +opt saved_model_tags( + "tf-savedmodel-tags", + llvm::cl::desc("Tags used to indicate which MetaGraphDef to import, " + "separated by ','"), + llvm::cl::init("serve")); + +// NOLINTNEXTLINE +opt saved_model_exported_names( + "tf-savedmodel-exported-names", + llvm::cl::desc("Names to export from SavedModel, separated by ','. Empty " + "(the default) means export all."), + llvm::cl::init("")); + // NOLINTNEXTLINE opt output_file_name("o", llvm::cl::desc(""), llvm::cl::value_desc("filename"), diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.h b/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.h index faa74865f5f..d7e54d70b81 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.h +++ b/tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.h @@ -39,4 +39,10 @@ extern llvm::cl::opt inline_functions; extern llvm::cl::list custom_opdefs; extern llvm::cl::opt emit_quant_adaptor_ops; extern llvm::cl::opt quant_stats_file_name; + +// Import saved model. 
+extern llvm::cl::opt import_saved_model; +extern llvm::cl::opt import_saved_model_v1; +extern llvm::cl::opt saved_model_tags; +extern llvm::cl::opt saved_model_exported_names; #endif // TENSORFLOW_COMPILER_MLIR_LITE_TF_TFL_TRANSLATE_CL_H_ diff --git a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc index 6ea1ca26d62..f5097e1c01b 100644 --- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc @@ -15,6 +15,10 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h" +#include +#include + +#include "llvm/Support/raw_ostream.h" #include "mlir/IR/Attributes.h" // TF:llvm-project #include "mlir/IR/Module.h" // TF:llvm-project #include "mlir/Parser.h" // TF:llvm-project @@ -155,4 +159,37 @@ Status ConvertTFExecutorToTFLOrFlatbuffer( return Status::OK(); } +StatusOr ImportSavedModel( + bool import_saved_model, bool import_saved_model_v1, + const std::string& input_filename, const std::string& saved_model_tags, + const std::string& saved_model_exported_names, mlir::MLIRContext* context) { + if (import_saved_model) { + std::unordered_set tags = + absl::StrSplit(saved_model_tags, ','); + std::vector exported_names = + absl::StrSplit(saved_model_exported_names, ',', absl::SkipEmpty()); + + auto module = tensorflow::SavedModelToMlirImport( + input_filename, tags, absl::Span(exported_names), context); + if (!module) + return tensorflow::errors::InvalidArgument("fail to open input file"); + + return module; + } else if (import_saved_model_v1) { + std::unordered_set tags = + absl::StrSplit(saved_model_tags, ','); + + auto module = + tensorflow::SavedModelV1ToMlirImport(input_filename, tags, context); + + if (!module) + return tensorflow::errors::InvalidArgument("fail to open input file"); + + return module; + } else { + return tensorflow::errors::InvalidArgument( + "Should be either saved model v1 or v2"); + } +} + } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h index 6f002af463b..f670ac8e52b 100644 --- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h +++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h @@ -40,6 +40,12 @@ LoadFromGraphdefOrMlirSource( absl::string_view output_arrays, bool prune_unused_nodes, llvm::SourceMgr* source_mgr, mlir::MLIRContext* context); +// Load Saved model (either v1 or v2) into MLIR. +stream_executor::port::StatusOr ImportSavedModel( + bool import_saved_model, bool import_saved_model_v1, + const std::string& input_filename, const std::string& saved_model_tags, + const std::string& saved_model_exported_names, mlir::MLIRContext* context); + // Taking a MLIR module in TF executor dialect and a set of parameters, // applies a set of passes to convert the module to TF Lite dialect and // serializes the result to a string. Depending on an attribute in the module From 5ce081684d760ebadf22398cab4bd96958a7aa23 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Wed, 19 Feb 2020 23:29:51 -0800 Subject: [PATCH 335/442] Legalize ophint converted node in mlir. (this is the first cl of a series of cls for import ophint python directly converted model). 
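The pattern added here keys off the `_tflite_input_indices` attribute that the ophint converter attaches to `tf.UnidirectionalSequenceLstm`: operands named by those indices are forwarded, every other one of the 24 `tfl.unidirectional_sequence_lstm` operands is filled with a `none` placeholder, the single result is wired to the original node's third output, and the attributes are fixed (fused activation TANH, cell_clip 10.0, proj_clip 0.0, time_major true). A standalone sketch of just the operand-mapping step; the helper name and types below are illustrative and not part of the patch:

  #include <string>
  #include <vector>

  // Maps ophint-specified inputs onto the fixed 24-operand layout of
  // tfl.unidirectional_sequence_lstm, padding unspecified slots with "none".
  // tflite_indices is sorted and each index is a valid operand position of
  // the original tf.UnidirectionalSequenceLstm node.
  std::vector<std::string> BuildLstmOperands(
      const std::vector<int>& tflite_indices,         // from _tflite_input_indices
      const std::vector<std::string>& tf_operands) {  // operands of the tf.* node
    std::vector<std::string> operands;
    int count = 0;
    for (int i = 0; i < 24; ++i) {
      if (count < static_cast<int>(tflite_indices.size()) &&
          tflite_indices[count] == i) {
        operands.push_back(tf_operands[i]);  // input the ophint converter specified
        ++count;
      } else {
        operands.push_back("none");  // optional input left unset
      }
    }
    return operands;
  }

With `_tflite_input_indices = [0, ..., 15, 18, 19]` (the no-projection case in the tests below), slots 16, 17 and 20-23 come out as `none`, matching the CHECK lines in legalize-tf.mlir.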
PiperOrigin-RevId: 296140203 Change-Id: I726b4b88d7fe7878c283f7806b15538304fba7f9 --- .../compiler/mlir/lite/tests/legalize-tf.mlir | 45 +++++++++++ .../mlir/lite/transforms/legalize_tf.cc | 74 +++++++++++++++++++ 2 files changed, 119 insertions(+) diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir index 408975586d6..e40047ea216 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir @@ -1408,3 +1408,48 @@ func @random_uniform_no_fold3() -> tensor<2x5xf64> { // CHECK-LABEL: random_uniform_no_fold3 // CHECK: %[[RANDOM:.*]] = "tf.RandomUniform" } + +func @LstmWithoutProjection(%arg: tensor<28x1x28xf32>) -> (tensor<28x1x16xf32>) { + %1 = "tf.Const"() {device = "", dtype = f32, value = dense<0.000000e+00>: tensor<16x28xf32>} : () -> tensor<16x28xf32> + %2 = "tf.Const"() {device = "", dtype = f32, value = dense<0.000000e+00>: tensor<16x16xf32>} : () -> tensor<16x16xf32> + %3 = "tf.Const"() {device = "", dtype = f32, value = dense<0.000000e+00>: tensor<16xf32>} : () -> tensor<16xf32> + %4 = "tf.Const"() {device = "", dtype = f32, value = dense<0.000000e+00>: tensor<1x16xf32>} : () -> tensor<1x16xf32> + %5 = "tf.Const"() {device = "", dtype = f32, value = dense<-1.000000e+00> : tensor<1xf32>} : () -> tensor<1xf32> + %6:3 = "tf.UnidirectionalSequenceLstm"(%arg, %1, %1, %1, %1, %2, %2, %2, %2, %3, %3, %3, %3, %3, %3, %3, %5, %5, %4, %4) {_tflite_input_indices = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19], device = ""} : (tensor<28x1x28xf32>, tensor<16x28xf32>, tensor<16x28xf32>, tensor<16x28xf32>, tensor<16x28xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>, tensor<1x16xf32>, tensor<1x16xf32>) -> (tensor<*xf32>, tensor<*xf32>, tensor<28x1x16xf32>) + return %6#2 : tensor<28x1x16xf32> +} + +// CHECK: func @LstmWithoutProjection([[VAL_0:%.*]]: tensor<28x1x28xf32>) -> tensor<28x1x16xf32> { +// CHECK: [[VAL_1:%.*]] = constant dense<0.000000e+00> : tensor<16x28xf32> +// CHECK: [[VAL_2:%.*]] = constant dense<0.000000e+00> : tensor<16x16xf32> +// CHECK: [[VAL_3:%.*]] = constant dense<0.000000e+00> : tensor<16xf32> +// CHECK: [[VAL_4:%.*]] = constant dense<0.000000e+00> : tensor<1x16xf32> +// CHECK: [[VAL_5:%.*]] = constant unit +// CHECK: [[VAL_6:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_0]], [[VAL_1]], [[VAL_1]], [[VAL_1]], [[VAL_1]], [[VAL_2]], [[VAL_2]], [[VAL_2]], [[VAL_2]], [[VAL_3]], [[VAL_3]], [[VAL_3]], [[VAL_3]], [[VAL_3]], [[VAL_3]], [[VAL_3]], [[VAL_5]], [[VAL_5]], [[VAL_4]], [[VAL_4]], [[VAL_5]], [[VAL_5]], [[VAL_5]], [[VAL_5]]) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor<28x1x28xf32>, tensor<16x28xf32>, tensor<16x28xf32>, tensor<16x28xf32>, tensor<16x28xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, none, none, tensor<1x16xf32>, tensor<1x16xf32>, none, none, none, none) -> tensor<28x1x16xf32> +// CHECK: return [[VAL_6]] : tensor<28x1x16xf32> +// CHECK: } + +func @LstmWithProjection(%arg: tensor<28x1x16xf32>) -> (tensor<28x1x8xf32>) { + %1 = "tf.Const"() {device = "", dtype = f32, value = dense<0.000000e+00>: tensor<16x16xf32>} : () 
-> tensor<16x16xf32> + %2 = "tf.Const"() {device = "", dtype = f32, value = dense<0.000000e+00>: tensor<16x8xf32>} : () -> tensor<16x8xf32> + %3 = "tf.Const"() {device = "", dtype = f32, value = dense<0.000000e+00>: tensor<16xf32>} : () -> tensor<16xf32> + %4 = "tf.Const"() {device = "", dtype = f32, value = dense<0.000000e+00>: tensor<1x16xf32>} : () -> tensor<1x16xf32> + %5 = "tf.Const"() {device = "", dtype = f32, value = dense<0.000000e+00>: tensor<8x16xf32>} : () -> tensor<8x16xf32> + %6 = "tf.Const"() {device = "", dtype = f32, value = dense<0.000000e+00>: tensor<1x8xf32>} : () -> tensor<1x8xf32> + %7 = "tf.Const"() {device = "", dtype = f32, value = dense<-1.000000e+00> : tensor<1xf32>} : () -> tensor<1xf32> + %8:3 = "tf.UnidirectionalSequenceLstm"(%arg, %1, %1, %1, %1, %2, %2, %2, %2, %7, %7, %7, %3, %3, %3, %3, %5, %7, %6, %4) {_tflite_input_indices = [0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 13, 14, 15, 16, 18, 19], device = ""} : (tensor<28x1x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x8xf32>, tensor<16x8xf32>, tensor<16x8xf32>, tensor<16x8xf32>, tensor<1xf32>, tensor<1xf32>, tensor<1xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<8x16xf32>, tensor<1xf32>, tensor<1x8xf32>, tensor<1x16xf32>) -> (tensor<*xf32>, tensor<*xf32>, tensor<28x1x8xf32>) + return %8#2 : tensor<28x1x8xf32> +} + +// CHECK-LABEL: func @LstmWithProjection( +// CHECK-SAME: [[VAL_7:%.*]]: tensor<28x1x16xf32>) -> tensor<28x1x8xf32> { +// CHECK: [[VAL_8:%.*]] = constant dense<0.000000e+00> : tensor<16x16xf32> +// CHECK: [[VAL_9:%.*]] = constant dense<0.000000e+00> : tensor<16x8xf32> +// CHECK: [[VAL_10:%.*]] = constant dense<0.000000e+00> : tensor<16xf32> +// CHECK: [[VAL_11:%.*]] = constant dense<0.000000e+00> : tensor<1x16xf32> +// CHECK: [[VAL_12:%.*]] = constant dense<0.000000e+00> : tensor<8x16xf32> +// CHECK: [[VAL_13:%.*]] = constant dense<0.000000e+00> : tensor<1x8xf32> +// CHECK: [[VAL_14:%.*]] = constant unit +// CHECK: [[VAL_15:%.*]] = "tfl.unidirectional_sequence_lstm"([[VAL_7]], [[VAL_8]], [[VAL_8]], [[VAL_8]], [[VAL_8]], [[VAL_9]], [[VAL_9]], [[VAL_9]], [[VAL_9]], [[VAL_14]], [[VAL_14]], [[VAL_14]], [[VAL_10]], [[VAL_10]], [[VAL_10]], [[VAL_10]], [[VAL_12]], [[VAL_14]], [[VAL_13]], [[VAL_11]], [[VAL_14]], [[VAL_14]], [[VAL_14]], [[VAL_14]]) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = true} : (tensor<28x1x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x16xf32>, tensor<16x8xf32>, tensor<16x8xf32>, tensor<16x8xf32>, tensor<16x8xf32>, none, none, none, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<16xf32>, tensor<8x16xf32>, none, tensor<1x8xf32>, tensor<1x16xf32>, none, none, none, none) -> tensor<28x1x8xf32> +// CHECK: return [[VAL_15]] : tensor<28x1x8xf32> +// CHECK: } diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc index 99e7e99f66a..7501832099a 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc @@ -63,6 +63,9 @@ namespace { using xla::Status; using xla::StatusOr; +constexpr char kUnidirectionalSequenceLstm[] = "tf.UnidirectionalSequenceLstm"; +constexpr char kTfLiteInputIndices[] = "_tflite_input_indices"; + // Legalize operations in functions. 
struct LegalizeTF : public FunctionPass { void runOnFunction() override; @@ -561,6 +564,74 @@ PatternMatchResult ConvertTFReciprocalOp::matchAndRewrite( return matchSuccess(); } +// Legalize unidirectional sequence lstm. +struct LegalizeUnidirectionalSequenceLstm : public RewritePattern { + explicit LegalizeUnidirectionalSequenceLstm(MLIRContext* context) + : RewritePattern(kUnidirectionalSequenceLstm, 1, context) {} + + PatternMatchResult matchAndRewrite(Operation* op, + PatternRewriter& rewriter) const override { + auto tflite_indices_attr = + op->getAttrOfType(kTfLiteInputIndices); + if (!tflite_indices_attr) return matchFailure(); + + SmallVector tflite_indices; + for (auto index_attr : tflite_indices_attr.getValue()) { + IntegerAttr index = index_attr.cast(); + tflite_indices.push_back(index.getInt()); + } + + // Optional input placeholder. + Value none = rewriter.create( + op->getLoc(), rewriter.getNoneType(), rewriter.getUnitAttr()); + + // Populate inputs. + // UnidirectionalSequenceLstm is expected to have 24 inputs. + SmallVector inputs; + int count = 0; + int total_ophint_converted_inputs = tflite_indices.size(); + for (int i = 0; i < 24; ++i) { + if (count < total_ophint_converted_inputs && tflite_indices[count] == i) { + // specified input. + inputs.push_back(op->getOperand(i)); + count++; + } else { + // Non specified input. + inputs.push_back(none); + } + } + + // Populate outputs. + // UnidirectionalSequenceLstm should only have 1 output, and that is the + // original ophint converted node's 3rd output. + SmallVector result_types; + result_types.push_back(op->getOpResult(2).getType()); + + // Populate attributes. + SmallVector attributes; + // Activation will always be tanh. + attributes.push_back(rewriter.getNamedAttr("fused_activation_function", + rewriter.getStringAttr("TANH"))); + // cell_clip. + attributes.push_back( + rewriter.getNamedAttr("cell_clip", rewriter.getF32FloatAttr(10.0))); + // proj_clip. + attributes.push_back( + rewriter.getNamedAttr("proj_clip", rewriter.getF32FloatAttr(0.0))); + // will always be time_majored. + attributes.push_back( + rewriter.getNamedAttr("time_major", rewriter.getBoolAttr(true))); + + auto lstm_op = rewriter.create( + op->getLoc(), result_types, inputs, attributes); + + // Rewire the output. + op->getResult(2).replaceAllUsesWith(lstm_op.getResult()); + op->erase(); + return matchSuccess(); + } +}; + void LegalizeTF::runOnFunction() { OwningRewritePatternList patterns; auto* ctx = &getContext(); @@ -574,6 +645,9 @@ void LegalizeTF::runOnFunction() { ConvertTFReshapeOp, ConvertTFSplitOp, ConvertTFSplitVOp, ConvertTFStridedSliceOp, ConvertTFUnpackOp, ConvertTFAssertOp, ConvertTFReciprocalOp, ConvertTFRandomUniformOp>(ctx); + + // Ophint python converter converted tf node pattern. + patterns.insert(ctx); applyPatternsGreedily(func, patterns); } From a6ec8dadc4a8fb5d3df6577cb903483f2582c0a8 Mon Sep 17 00:00:00 2001 From: Blake Hechtman Date: Wed, 19 Feb 2020 23:58:40 -0800 Subject: [PATCH 336/442] [XLA] Avoid hash collisions in CseHash. 
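Previously CseHash only mixed in the opcode, the operand ids, the tuple index and (for constants) the literal hash, so instructions that differ only in shape dimensions, slice bounds, padding, or convolution/dot dimension numbers could land in the same bucket. The change folds those fields in via tensorflow::Hash64Combine. A simplified, standalone sketch of the idea; HashCombine, FakeInstruction and CseHashSketch are illustrative names, not the actual XLA code:

  #include <cstdint>
  #include <functional>
  #include <vector>

  // Illustrative 64-bit hash combiner, in the spirit of tensorflow::Hash64Combine.
  inline uint64_t HashCombine(uint64_t a, uint64_t b) {
    return a ^ (b + 0x9e3779b97f4a7c15ULL + (a << 6) + (a >> 2));
  }

  // Stand-in for an HLO instruction: the real pass hashes the opcode, the shape
  // dimensions, the operand ids, and opcode-specific fields such as slice starts.
  struct FakeInstruction {
    int opcode;
    std::vector<int64_t> dimensions;
    std::vector<int64_t> operand_ids;
    std::vector<int64_t> slice_starts;  // only meaningful for a slice op
  };

  uint64_t CseHashSketch(const FakeInstruction& instr) {
    uint64_t hash = std::hash<int>()(instr.opcode);
    for (int64_t d : instr.dimensions)
      hash = HashCombine(hash, static_cast<uint64_t>(d));
    for (int64_t id : instr.operand_ids)
      hash = HashCombine(hash, static_cast<uint64_t>(id));
    // Folding in opcode-specific fields keeps e.g. two slices of the same
    // operand with different start indices from colliding.
    for (int64_t s : instr.slice_starts)
      hash = HashCombine(hash, static_cast<uint64_t>(s));
    return hash;
  }

The actual change hashes whole arrays at once with tensorflow::Hash64 over their raw bytes (the c_hash lambda) and hashes protos such as the padding config and convolution window by their serialized size, which is cheap and still distinguishes most previously colliding cases.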
PiperOrigin-RevId: 296143190 Change-Id: I16cef346311b419f04911c241462fa55a5aa04ad --- tensorflow/compiler/xla/service/BUILD | 1 + tensorflow/compiler/xla/service/hlo_cse.cc | 46 ++++++++++++++++++++-- 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 34fd40f11d8..bb6219eb584 100755 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -3434,6 +3434,7 @@ cc_library( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/core:lib", + "//tensorflow/core/platform:hash", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/container:inlined_vector", ], diff --git a/tensorflow/compiler/xla/service/hlo_cse.cc b/tensorflow/compiler/xla/service/hlo_cse.cc index a58fcf4460a..373f4f12ba4 100644 --- a/tensorflow/compiler/xla/service/hlo_cse.cc +++ b/tensorflow/compiler/xla/service/hlo_cse.cc @@ -35,6 +35,7 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/platform/hash.h" namespace xla { @@ -96,17 +97,54 @@ StatusOr CombineConstants(HloComputation* computation, // share the exact same set of operands. int64 CseHash(const HloInstruction* instruction) { int64 hash = std::hash()(static_cast(instruction->opcode())); + auto c_hash = [](auto c) { + return tensorflow::Hash64(reinterpret_cast(c.data()), + c.size() * sizeof(c[0])); + }; + auto proto_hash = [](auto proto) { + return std::hash{}(proto.ByteSizeLong()); + }; hash = tensorflow::Hash64Combine( hash, instruction->opcode() == HloOpcode::kGetTupleElement ? instruction->tuple_index() - : -1); + : c_hash(instruction->shape().dimensions())); for (auto operand : instruction->operands()) { hash = tensorflow::Hash64Combine(hash, operand->unique_id()); } - if (instruction->opcode() == HloOpcode::kConstant) { - hash = tensorflow::Hash64Combine(hash, instruction->literal().Hash()); + for (auto c : instruction->called_computations()) { + hash = tensorflow::Hash64Combine( + hash, std::hash()( + static_cast(c->root_instruction()->opcode()))); + } + switch (instruction->opcode()) { + case HloOpcode::kConstant: + return tensorflow::Hash64Combine(hash, instruction->literal().Hash()); + case HloOpcode::kSlice: + return tensorflow::Hash64Combine( + tensorflow::Hash64Combine(hash, c_hash(instruction->slice_starts())), + c_hash(instruction->slice_strides())); + case HloOpcode::kPad: + return tensorflow::Hash64Combine( + hash, proto_hash(instruction->padding_config())); + case HloOpcode::kDot: + return tensorflow::Hash64Combine( + hash, proto_hash(instruction->dot_dimension_numbers())); + case HloOpcode::kConvolution: + return tensorflow::Hash64Combine( + tensorflow::Hash64Combine( + hash, proto_hash(instruction->convolution_dimension_numbers())), + proto_hash(instruction->window())); + case HloOpcode::kReduceWindow: + return tensorflow::Hash64Combine(hash, proto_hash(instruction->window())); + case HloOpcode::kConcatenate: + case HloOpcode::kBroadcast: + case HloOpcode::kTranspose: + case HloOpcode::kIota: + case HloOpcode::kReduce: + return tensorflow::Hash64Combine(hash, c_hash(instruction->dimensions())); + default: + return hash; } - return hash; } } // namespace From 76562fef92f46624cb6bae475f92c32f3411863e Mon Sep 17 00:00:00 2001 From: Terry Heo Date: Thu, 20 Feb 2020 00:20:05 -0800 Subject: [PATCH 337/442] Add Maximum & Minimum op support 
for GPU delegate Refactored elementwise op kernel to handle Maximum & Minimum. PiperOrigin-RevId: 296146084 Change-Id: Iefd333b79638d8705b28167657af475aa75e639a --- tensorflow/lite/delegates/gpu/README.md | 2 + .../delegates/gpu/cl/kernels/elementwise.cc | 93 ++++++++--- .../delegates/gpu/cl/kernels/elementwise.h | 14 +- .../gpu/cl/kernels/elementwise_test.cc | 112 +++++++++++++ .../gpu/cl/selectors/operation_selector.cc | 8 +- .../delegates/gpu/common/model_builder.cc | 155 +++++++++++++----- .../lite/delegates/gpu/common/operations.cc | 6 + .../lite/delegates/gpu/common/operations.h | 15 +- .../delegates/gpu/gl/kernels/elementwise.cc | 50 +++++- .../gpu/gl/kernels/elementwise_test.cc | 58 +++++++ .../lite/delegates/gpu/gl/kernels/registry.cc | 2 + tensorflow/lite/delegates/gpu/metal/api.cc | 4 +- 12 files changed, 445 insertions(+), 74 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/README.md b/tensorflow/lite/delegates/gpu/README.md index 2b216773c18..42d8e4b2caa 100644 --- a/tensorflow/lite/delegates/gpu/README.md +++ b/tensorflow/lite/delegates/gpu/README.md @@ -34,6 +34,8 @@ TFLite on GPU supports the following ops in 16-bit and 32-bit float precision: * `LOGISTIC v1` * `LSTM v2 (Basic LSTM only)` * `MAX_POOL_2D v1` +* `MAXIMUM v1` +* `MINIMUM v1` * `MUL v1` * `PAD v1` * `PRELU v1` diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc index b6c6b1409f8..9fb3e45fe81 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc @@ -106,7 +106,9 @@ ElementwiseTwoInput::ElementwiseTwoInput(ElementwiseTwoInput&& operation) : ElementwiseOperation(std::move(operation)), link_index_(operation.link_index_), op_type_(operation.op_type_), - broadcast_(operation.broadcast_) {} + broadcast_(operation.broadcast_), + scalar_para_(operation.scalar_para_), + use_scalar_para_(operation.use_scalar_para_) {} ElementwiseTwoInput& ElementwiseTwoInput::operator=( ElementwiseTwoInput&& operation) { @@ -114,30 +116,43 @@ ElementwiseTwoInput& ElementwiseTwoInput::operator=( link_index_ = operation.link_index_; op_type_ = operation.op_type_; broadcast_ = operation.broadcast_; + scalar_para_ = operation.scalar_para_; + use_scalar_para_ = operation.use_scalar_para_; ElementwiseOperation::operator=(std::move(operation)); } return *this; } -void ElementwiseTwoInput::SetLinkIndex(int index) { link_index_ = index; } +void ElementwiseTwoInput::SetLinkIndex(int index) { + link_index_ = index; + if (use_scalar_para_) { + scalar_para_.SetName(absl::StrCat("scalar_para_", index)); + } +} std::string ElementwiseTwoInput::GetCoreCode( const LinkingContext& context) const { - const std::string size_name = "src_size_" + std::to_string(link_index_); - TensorCodeGenerator src_tensor( - absl::StrCat("src_data_", link_index_), - WHSPoint{size_name + ".x", size_name + ".y", size_name + ".z"}, - definition_.src_tensors[1]); - const std::string x_coord = broadcast_.width ? "0" : context.x_coord; - const std::string y_coord = broadcast_.height ? "0" : context.y_coord; - const std::string s_coord = broadcast_.channels ? 
"0" : context.s_coord; - const std::string second_var = "second_var_" + std::to_string(link_index_); - std::string result = " FLT4 " + second_var + " = " + - src_tensor.ReadWHS(x_coord, y_coord, s_coord) + ";\n"; - if (broadcast_.channels) { - result += " " + second_var + ".y = " + second_var + ".x;\n"; - result += " " + second_var + ".z = " + second_var + ".x;\n"; - result += " " + second_var + ".w = " + second_var + ".x;\n"; + std::string result; + std::string second_var; + if (use_scalar_para_) { + second_var = absl::StrCat("(FLT)(", scalar_para_.GetName(), ")"); + } else { + const std::string size_name = "src_size_" + std::to_string(link_index_); + TensorCodeGenerator src_tensor( + absl::StrCat("src_data_", link_index_), + WHSPoint{size_name + ".x", size_name + ".y", size_name + ".z"}, + definition_.src_tensors[1]); + const std::string x_coord = broadcast_.width ? "0" : context.x_coord; + const std::string y_coord = broadcast_.height ? "0" : context.y_coord; + const std::string s_coord = broadcast_.channels ? "0" : context.s_coord; + second_var = "second_var_" + std::to_string(link_index_); + result = " FLT4 " + second_var + " = " + + src_tensor.ReadWHS(x_coord, y_coord, s_coord) + ";\n"; + if (broadcast_.channels) { + result += " " + second_var + ".y = " + second_var + ".x;\n"; + result += " " + second_var + ".z = " + second_var + ".x;\n"; + result += " " + second_var + ".w = " + second_var + ".x;\n"; + } } switch (op_type_) { case OperationType::ADD: @@ -146,6 +161,12 @@ std::string ElementwiseTwoInput::GetCoreCode( case OperationType::DIV: result += "$0 /= $1;\n"; break; + case OperationType::MAXIMUM: + result += "$0 = max($0, $1);\n"; + break; + case OperationType::MINIMUM: + result += "$0 = min($0, $1);\n"; + break; case OperationType::MUL: result += "$0 *= $1;\n"; break; @@ -167,20 +188,44 @@ std::string ElementwiseTwoInput::GetCoreCode( std::string ElementwiseTwoInput::GetArgsDeclaration() const { std::string args; - absl::StrAppend(&args, ",\n", - GetTensorDeclaration(AccessType::READ, - absl::StrCat("src_data_", link_index_), - definition_.src_tensors[1])); - absl::StrAppend(&args, ",\n int4 src_size_", link_index_); + if (use_scalar_para_) { + absl::StrAppend(&args, ",\n ", scalar_para_.GetDeclaration()); + } else { + absl::StrAppend(&args, ",\n", + GetTensorDeclaration(AccessType::READ, + absl::StrCat("src_data_", link_index_), + definition_.src_tensors[1])); + absl::StrAppend(&args, ",\n int4 src_size_", link_index_); + } return args; } Status ElementwiseTwoInput::BindArguments(CLKernel* kernel) { - RETURN_IF_ERROR(kernel->SetMemoryAuto(src_[1]->GetMemoryPtr())); - RETURN_IF_ERROR(kernel->SetBytesAuto(src_[1]->GetWBatchedHSB())); + if (use_scalar_para_) { + RETURN_IF_ERROR(kernel->SetBytesAuto(scalar_para_)); + } else { + RETURN_IF_ERROR(kernel->SetMemoryAuto(src_[1]->GetMemoryPtr())); + RETURN_IF_ERROR(kernel->SetBytesAuto(src_[1]->GetWBatchedHSB())); + } return OkStatus(); } +ElementwiseTwoInput CreateElementwiseTwoInput( + const CreationContext& creation_context, const OperationDef& definition, + const OperationType& op_type, const BroadcastSettings& broadcast, + const ElementwiseAttributes& attr) { + ElementwiseTwoInput operation(definition, op_type, broadcast); + auto scalar = absl::get_if(&attr.param); + if (scalar) { + const auto scalar_precision = creation_context.device->IsPowerVR() + ? 
CalculationsPrecision::F32 + : definition.precision; + operation.SetScalarPara(FLT(scalar_precision, *scalar)); + } + operation.SetLinkIndex(0); + return operation; +} + ElementwiseTwoInput CreateElementwiseTwoInput( const OperationDef& definition, const OperationType& op_type, const BroadcastSettings& broadcast) { diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h index a09ddd1b7db..a70114d1081 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h @@ -63,7 +63,8 @@ class ElementwiseTwoInput : public ElementwiseOperation { const BroadcastSettings& broadcast) : ElementwiseOperation(definition), op_type_(op_type), - broadcast_(broadcast) {} + broadcast_(broadcast), + use_scalar_para_(false) {} // Move only ElementwiseTwoInput(ElementwiseTwoInput&& operation); @@ -75,13 +76,24 @@ class ElementwiseTwoInput : public ElementwiseOperation { std::string GetCoreCode(const LinkingContext& context) const override; std::string GetArgsDeclaration() const override; Status BindArguments(CLKernel* kernel) override; + inline void SetScalarPara(FLT scalar) { + scalar_para_ = scalar; + use_scalar_para_ = true; + } private: int link_index_; OperationType op_type_; BroadcastSettings broadcast_; + FLT scalar_para_; + bool use_scalar_para_; }; +ElementwiseTwoInput CreateElementwiseTwoInput( + const CreationContext& creation_context, const OperationDef& definition, + const OperationType& op_type, const BroadcastSettings& broadcast, + const ElementwiseAttributes& attr); + ElementwiseTwoInput CreateElementwiseTwoInput( const OperationDef& definition, const OperationType& op_type, const BroadcastSettings& broadcast); diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc index 24d30eecf25..aa1f83cc495 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc @@ -425,6 +425,118 @@ TEST_F(OpenCLOperationTest, Add) { } } +TEST_F(OpenCLOperationTest, Maxiumum) { + TensorFloat32 src_tensor_0, src_tensor_1; + src_tensor_0.shape = BHWC(1, 2, 1, 2); + src_tensor_1.shape = BHWC(1, 2, 1, 2); + src_tensor_0.data = {0.0f, -6.2f, 2.0f, -3.0f}; + src_tensor_1.data = {1.0f, 2.0f, 3.0f, -2.0f}; + + for (auto storage : env_.GetSupportedStorages()) { + for (auto precision : env_.GetSupportedPrecisions()) { + const float eps = precision == CalculationsPrecision::F32 ? 
1e-6f : 1e-2f; + OperationDef op_def; + op_def.precision = precision; + auto data_type = DeduceDataTypeFromPrecision(precision); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); + TensorFloat32 dst_tensor; + ElementwiseTwoInput operation = + CreateElementwiseTwoInput(op_def, OperationType::MAXIMUM); + ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1}, + creation_context_, &operation, + BHWC(1, 2, 1, 2), &dst_tensor)); + EXPECT_THAT(dst_tensor.data, + Pointwise(FloatNear(eps), {1.0f, 2.0f, 3.0f, -2.0f})); + } + } +} + +TEST_F(OpenCLOperationTest, MaxiumumWithScalar) { + TensorFloat32 src_tensor_0; + src_tensor_0.shape = BHWC(1, 4, 1, 1); + src_tensor_0.data = {0.0f, -6.2f, 2.0f, -3.0f}; + + ElementwiseAttributes attr; + attr.param = -1.0f; + + for (auto storage : env_.GetSupportedStorages()) { + for (auto precision : env_.GetSupportedPrecisions()) { + const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-2f; + OperationDef op_def; + op_def.precision = precision; + auto data_type = DeduceDataTypeFromPrecision(precision); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); + TensorFloat32 dst_tensor; + BroadcastSettings broadcast; + ElementwiseTwoInput operation = CreateElementwiseTwoInput( + creation_context_, op_def, OperationType::MAXIMUM, broadcast, attr); + ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation, + BHWC(1, 4, 1, 1), &dst_tensor)); + EXPECT_THAT(dst_tensor.data, + Pointwise(FloatNear(eps), {0.0f, -1.0f, 2.0f, -1.0f})); + } + } +} + +TEST_F(OpenCLOperationTest, Minimum) { + TensorFloat32 src_tensor_0, src_tensor_1; + src_tensor_0.shape = BHWC(1, 2, 1, 2); + src_tensor_1.shape = BHWC(1, 2, 1, 2); + src_tensor_0.data = {0.0f, -6.2f, 2.0f, -3.0f}; + src_tensor_1.data = {1.0f, 2.0f, 3.0f, -2.0f}; + + for (auto storage : env_.GetSupportedStorages()) { + for (auto precision : env_.GetSupportedPrecisions()) { + const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-2f; + OperationDef op_def; + op_def.precision = precision; + auto data_type = DeduceDataTypeFromPrecision(precision); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); + TensorFloat32 dst_tensor; + ElementwiseTwoInput operation = + CreateElementwiseTwoInput(op_def, OperationType::MINIMUM); + ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1}, + creation_context_, &operation, + BHWC(1, 2, 1, 2), &dst_tensor)); + EXPECT_THAT(dst_tensor.data, + Pointwise(FloatNear(eps), {0.0f, -6.2f, 2.0f, -3.0f})); + } + } +} + +TEST_F(OpenCLOperationTest, MinimumWithScalar) { + TensorFloat32 src_tensor_0; + src_tensor_0.shape = BHWC(1, 4, 1, 1); + src_tensor_0.data = {0.0f, -6.2f, 2.0f, -3.0f}; + + ElementwiseAttributes attr; + attr.param = -1.0f; + + for (auto storage : env_.GetSupportedStorages()) { + for (auto precision : env_.GetSupportedPrecisions()) { + const float eps = precision == CalculationsPrecision::F32 ? 
1e-6f : 1e-2f; + OperationDef op_def; + op_def.precision = precision; + auto data_type = DeduceDataTypeFromPrecision(precision); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); + TensorFloat32 dst_tensor; + BroadcastSettings broadcast; + ElementwiseTwoInput operation = CreateElementwiseTwoInput( + creation_context_, op_def, OperationType::MINIMUM, broadcast, attr); + ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation, + BHWC(1, 4, 1, 1), &dst_tensor)); + EXPECT_THAT(dst_tensor.data, + Pointwise(FloatNear(eps), {-1.0f, -6.2f, -1.0f, -3.0f})); + } + } +} + TEST_F(OpenCLOperationTest, Mul) { TensorFloat32 src_tensor_0, src_tensor_1; src_tensor_0.shape = BHWC(1, 2, 1, 2); diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc b/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc index e45a750b2fd..3153d7ddfd8 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc +++ b/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc @@ -231,6 +231,8 @@ Status GPUOperationFromNode(const CreationContext& creation_context, return OkStatus(); } case OperationType::DIV: + case OperationType::MAXIMUM: + case OperationType::MINIMUM: case OperationType::POW: case OperationType::SQUARED_DIFF: case OperationType::SUB: { @@ -238,8 +240,10 @@ Status GPUOperationFromNode(const CreationContext& creation_context, broadcast.width = IsWidthBroadcastedForSecondInput(inputs); broadcast.height = IsHeightBroadcastedForSecondInput(inputs); broadcast.channels = IsChannelsBroadcastedForSecondInput(inputs); - ElementwiseTwoInput operation = - CreateElementwiseTwoInput(op_def, op_type, broadcast); + const auto attr = + absl::any_cast(node.operation.attributes); + ElementwiseTwoInput operation = CreateElementwiseTwoInput( + creation_context, op_def, op_type, broadcast, attr); *gpu_op = absl::make_unique(std::move(operation)); return OkStatus(); } diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc index fc912f383ec..73d7e8821e8 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc @@ -389,6 +389,39 @@ Status CheckInputsOutputs(const TfLiteContext* context, return OkStatus(); } +// The function checks input tensors including 1 constant tensor. 
+Status CheckInputsOutputsAllowingOneConstInput(const TfLiteContext* context, + const TfLiteNode* tflite_node, + int inputs, int outputs) { + int number_of_const_inputs = 0; + int number_of_runtime_inputs = 0; + for (int i = 0; i < tflite_node->inputs->size; i++) { + if (IsConstantTensor(&context->tensors[tflite_node->inputs->data[i]])) { + number_of_const_inputs++; + } else { + number_of_runtime_inputs++; + } + } + if (tflite_node->inputs->size != inputs) { + return InternalError(absl::StrFormat( + "Expected %d input tensor(s), but node has %d input(s).", inputs, + tflite_node->inputs->size)); + } + if (number_of_const_inputs > 1) { + return InternalError(absl::StrFormat( + "Expected 1 const input tensor, but node has %d const input(s).", + number_of_const_inputs)); + } + int runtime_outputs = GetNumberOfRuntimeOutputsForNode(context, tflite_node); + if (runtime_outputs != outputs) { + return InternalError( + absl::StrFormat("Expected %d output tensor(s), but node has %d runtime " + "output(s).", + outputs, runtime_outputs)); + } + return OkStatus(); +} + // A parser responsible for parsing TFLite operation and adding it to a graph. class TFLiteOperationParser { public: @@ -642,6 +675,55 @@ Status ExtractTensorShape(const TfLiteTensor& tflite_tensor, BHWC* bhwc) { } } +Status ParseInputsWithConstTensor(Node* node, ObjectReader* reader, + TensorOrScalar* tensor_or_scalar) { + const std::string& opname = node->operation.type; + + // Determine runtime/constant tensors. + const TfLiteTensor* input0 = reader->GetInputTensor(0); + if (!input0) { + return InvalidArgumentError("Couldn't get the 1st input tensor for " + + opname); + } + const TfLiteTensor* input1 = reader->GetInputTensor(1); + if (!input1) { + return InvalidArgumentError("Couldn't get the 2nd input tensor for " + + opname); + } + const bool constant_tensor0 = IsConstantTensor(input0); + const bool constant_tensor1 = IsConstantTensor(input1); + if (constant_tensor0 && constant_tensor1) { + return InvalidArgumentError("No runtime input tensors for " + opname); + } + const bool runtime_tensor0 = !constant_tensor0; + const bool runtime_tensor1 = !constant_tensor1; + + if (runtime_tensor0 && runtime_tensor1) { + RETURN_IF_ERROR(reader->AddInput(node, 0)); + RETURN_IF_ERROR(reader->AddInput(node, 1)); + } else { + int runtime_tensor = 0; + int constant_tensor = 1; + TfLiteIntArray* constant_dims = input1->dims; + if (constant_tensor0 && runtime_tensor1) { + runtime_tensor = 1; + constant_tensor = 0; + constant_dims = input0->dims; + } + RETURN_IF_ERROR(reader->AddInput(node, runtime_tensor)); + if (constant_dims->size <= 0) { + Tensor tensor; + RETURN_IF_ERROR(reader->ReadTensor(constant_tensor, &tensor)); + *tensor_or_scalar = tensor.data[0]; + } else { + Tensor tensor; + RETURN_IF_ERROR(reader->ReadTensor(constant_tensor, &tensor)); + *tensor_or_scalar = std::move(tensor); + } + } + return OkStatus(); +} + class AddOperationParser : public TFLiteOperationParser { public: Status IsSupported(const TfLiteContext* context, @@ -663,51 +745,11 @@ class AddOperationParser : public TFLiteOperationParser { // considers 2 input cases. The underlying GPU shader programs can accept // more inputs, but the logic below would have to be expanded. - // Determine runtime/constant tensors. 
- const TfLiteTensor* input0 = reader->GetInputTensor(0); - if (!input0) { - return InvalidArgumentError("Couldn't get the 1st input tensor for ADD."); - } - const TfLiteTensor* input1 = reader->GetInputTensor(1); - if (!input1) { - return InvalidArgumentError("Couldn't get the 2nd input tensor for ADD."); - } - const bool constant_tensor0 = IsConstantTensor(input0); - const bool constant_tensor1 = IsConstantTensor(input1); - if (constant_tensor0 && constant_tensor1) { - return InvalidArgumentError("No runtime input tensors for ADD."); - } - const bool runtime_tensor0 = !constant_tensor0; - const bool runtime_tensor1 = !constant_tensor1; - Node* node = graph->NewNode(); node->operation.type = ToString(OperationType::ADD); RETURN_IF_ERROR(reader->AddOutputs(node)); - AddAttributes attr; - if (runtime_tensor0 && runtime_tensor1) { - RETURN_IF_ERROR(reader->AddInput(node, 0)); - RETURN_IF_ERROR(reader->AddInput(node, 1)); - } else { - int runtime_tensor = 0; - int constant_tensor = 1; - TfLiteIntArray* constant_dims = input1->dims; - if (constant_tensor0 && runtime_tensor1) { - runtime_tensor = 1; - constant_tensor = 0; - constant_dims = input0->dims; - } - RETURN_IF_ERROR(reader->AddInput(node, runtime_tensor)); - if (constant_dims->size <= 0) { - Tensor tensor; - RETURN_IF_ERROR(reader->ReadTensor(constant_tensor, &tensor)); - attr.param = tensor.data[0]; - } else { - Tensor tensor; - RETURN_IF_ERROR(reader->ReadTensor(constant_tensor, &tensor)); - attr.param = std::move(tensor); - } - } + RETURN_IF_ERROR(ParseInputsWithConstTensor(node, reader, &attr.param)); node->operation.attributes = std::move(attr); const auto* tf_options = reinterpret_cast(tflite_node->builtin_data); @@ -1053,6 +1095,11 @@ class ElementwiseOperationParser : public TFLiteOperationParser { } else if (IsTwoArgumentOperation()) { RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, /*inputs=*/2, /*outputs=*/1)); + } else if (IsTwoArgumentOperationWithConst()) { + RETURN_IF_ERROR(CheckInputsOutputsAllowingOneConstInput(context, + tflite_node, + /*inputs=*/2, + /*outputs=*/1)); } else { return InvalidArgumentError("Op can only handle 1 or 2 operand(s)."); } @@ -1103,6 +1150,16 @@ class ElementwiseOperationParser : public TFLiteOperationParser { RETURN_IF_ERROR( MaybeFuseActivationToTheSingleOutput(activation, graph, node)); } + } else if (IsTwoArgumentOperationWithConst()) { + ElementwiseAttributes attr; + RETURN_IF_ERROR(ParseInputsWithConstTensor(node, reader, &attr.param)); + auto const_vector = + absl::get_if<::tflite::gpu::Tensor>( + &attr.param); + if (const_vector) { + return InvalidArgumentError("Constant vector is not supported"); + } + node->operation.attributes = std::move(attr); } else { return InvalidArgumentError("Incorrect operation type passed"); } @@ -1161,6 +1218,16 @@ class ElementwiseOperationParser : public TFLiteOperationParser { } } + bool IsTwoArgumentOperationWithConst() const { + switch (operation_type_) { + case OperationType::MINIMUM: + case OperationType::MAXIMUM: + return true; + default: + return false; + } + } + OperationType operation_type_; }; @@ -2547,10 +2614,16 @@ std::unique_ptr NewOperationParser( return absl::make_unique(OperationType::LOG); case kTfLiteBuiltinLstm: return absl::make_unique(); + case kTfLiteBuiltinMaximum: + return absl::make_unique( + OperationType::MAXIMUM); case kTfLiteBuiltinMaxPool2d: return absl::make_unique(PoolingType::MAX); case kTfLiteBuiltinMean: return absl::make_unique(); + case kTfLiteBuiltinMinimum: + return absl::make_unique( + 
OperationType::MINIMUM); case kTfLiteBuiltinMirrorPad: return absl::make_unique(/*mirror_pad=*/true); case kTfLiteBuiltinMul: diff --git a/tensorflow/lite/delegates/gpu/common/operations.cc b/tensorflow/lite/delegates/gpu/common/operations.cc index a4b3e2669a0..0d5c3429a49 100644 --- a/tensorflow/lite/delegates/gpu/common/operations.cc +++ b/tensorflow/lite/delegates/gpu/common/operations.cc @@ -98,10 +98,14 @@ std::string ToString(enum OperationType op) { return "log"; case OperationType::LSTM: return "lstm"; + case OperationType::MAXIMUM: + return "maximum"; case OperationType::MAX_UNPOOLING_2D: return "max_unpooling"; case OperationType::MEAN: return "mean"; + case OperationType::MINIMUM: + return "minimum"; case OperationType::MUL: return "mul"; case OperationType::PAD: @@ -165,8 +169,10 @@ OperationType OperationTypeFromString(const std::string& name) { {"hard_swish", OperationType::HARD_SWISH}, {"log", OperationType::LOG}, {"lstm", OperationType::LSTM}, + {"maximum", OperationType::MAXIMUM}, {"max_unpooling", OperationType::MAX_UNPOOLING_2D}, {"mean", OperationType::MEAN}, + {"minimum", OperationType::MINIMUM}, {"mul", OperationType::MUL}, {"pad", OperationType::PAD}, {"pooling_2d", OperationType::POOLING_2D}, diff --git a/tensorflow/lite/delegates/gpu/common/operations.h b/tensorflow/lite/delegates/gpu/common/operations.h index d58c82d4a26..87bb3ec383f 100644 --- a/tensorflow/lite/delegates/gpu/common/operations.h +++ b/tensorflow/lite/delegates/gpu/common/operations.h @@ -47,8 +47,10 @@ enum class OperationType { HARD_SWISH, LOG, LSTM, + MAXIMUM, MAX_UNPOOLING_2D, MEAN, + MINIMUM, MUL, PAD, POOLING_2D, @@ -75,6 +77,9 @@ std::string ToString(enum OperationType op); OperationType OperationTypeFromString(const std::string& name); +typedef absl::variant, float> + TensorOrScalar; + struct Padding2D { Padding2D() = default; Padding2D& operator=(const Padding2D& value); @@ -352,8 +357,7 @@ struct LstmAttributes { }; struct MultiplyAttributes { - absl::variant, float> - param; + TensorOrScalar param; }; enum class SamplingType { @@ -435,8 +439,7 @@ struct SliceAttributes { BHWC CalculateOutputShape(const BHWC& input, const SliceAttributes& attr); struct AddAttributes { - absl::variant, float> - param; + TensorOrScalar param; }; struct FullyConnectedAttributes { @@ -452,6 +455,10 @@ BHWC CalculateOutputShape(const BHWC& input, // @return shape of a tensor after Mean operation is applied to the given input. BHWC CalculateOutputShape(const BHWC& input, const MeanAttributes& attr); +struct ElementwiseAttributes { + TensorOrScalar param; +}; + struct ReshapeAttributes { BHWC new_shape; }; diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/elementwise.cc b/tensorflow/lite/delegates/gpu/gl/kernels/elementwise.cc index 9215eac7602..7ba2dd871e7 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/elementwise.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/elementwise.cc @@ -139,6 +139,14 @@ class ElementwiseTwoArguments : public NodeShader { source = "value_0 /= value_1;"; break; } + case OperationType::MAXIMUM: { + source = "value_0 = max(value_0, value_1);"; + break; + } + case OperationType::MINIMUM: { + source = "value_0 = min(value_0, value_1);"; + break; + } case OperationType::POW: { // From documentation : // The result is undefined if x<0 or if x=0 and y≤0. 
@@ -167,6 +175,37 @@ class ElementwiseTwoArguments : public NodeShader { return OkStatus(); } + Status ImplementElementwiseWithScalar(const GenerationContext& ctx, + const float scalar, + GeneratedCode* generated_code) const { + std::string source; + switch (operation_type_) { + case OperationType::MAXIMUM: { + source = "value_0 = max(value_0, $scalar$);"; + break; + } + case OperationType::MINIMUM: { + source = "value_0 = min(value_0, $scalar$);"; + break; + } + + default: + return InvalidArgumentError( + "Incorrect elementwise with scalar operation type."); + } + *generated_code = { + /*parameters=*/{{"scalar", scalar}}, + /*objects=*/{}, + /*shared_variables=*/{}, + /*workload=*/uint3(), + /*workgroup=*/uint3(), + /*source_code=*/source, + /*input=*/IOStructure::AUTO, + /*output=*/IOStructure::AUTO, + }; + return OkStatus(); + } + bool IsSupportedBroadcast(const GenerationContext& ctx) const { auto inputs = ctx.graph->FindInputs(ctx.node->id); auto outputs = ctx.graph->FindOutputs(ctx.node->id); @@ -219,8 +258,15 @@ class ElementwiseTwoArguments : public NodeShader { if (IsSupportedBroadcast(ctx)) { return ImplementElementwiseBroadcast(ctx, generated_code); } + auto attr = + absl::any_cast(ctx.node->operation.attributes); + auto scalar = absl::get_if(&attr.param); + if (scalar) { + return ImplementElementwiseWithScalar(ctx, *scalar, generated_code); + } return InvalidArgumentError( - "This case is not supported by subtract operation"); + "This case is not supported by elementwise with two arguments " + "operation"); } private: @@ -244,6 +290,8 @@ std::unique_ptr NewElementwiseNodeShader( case OperationType::TANH: return absl::make_unique(operation_type); case OperationType::DIV: + case OperationType::MAXIMUM: + case OperationType::MINIMUM: case OperationType::POW: case OperationType::SQUARED_DIFF: case OperationType::SUB: diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/elementwise_test.cc b/tensorflow/lite/delegates/gpu/gl/kernels/elementwise_test.cc index 6743664f7e2..e597cc898e9 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/elementwise_test.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/elementwise_test.cc @@ -100,6 +100,64 @@ TEST(ElementwiseTest, Log) { Pointwise(FloatNear(1e-6), {0.0, 1.14473, 0.0, 0.0})); } +TEST(ElementwiseTest, Maximum) { + OperationType op_type = OperationType::MAXIMUM; + const BHWC shape(1, 2, 2, 1); + SingleOpModel model( + {/*type=*/ToString(op_type), /*attributes=*/{}}, + /*inputs=*/{GetTensorRef(0, shape), GetTensorRef(1, shape)}, + /*outputs=*/{GetTensorRef(2, shape)}); + ASSERT_TRUE(model.PopulateTensor(0, {0.0, -6.2, 2.0, -3.0})); + ASSERT_TRUE(model.PopulateTensor(1, {1.0, 2.0, 3.0, -2.0})); + ASSERT_OK(model.Invoke(*NewElementwiseNodeShader(op_type))); + EXPECT_THAT(model.GetOutput(0), + Pointwise(FloatNear(1e-6), {1.0, 2.0, 3.0, -2.0})); +} + +TEST(ElementwiseTest, MaximumWithScalar) { + OperationType op_type = OperationType::MAXIMUM; + const BHWC shape(1, 2, 2, 1); + ElementwiseAttributes attr; + attr.param = -1.0f; + SingleOpModel model( + {/*type=*/ToString(op_type), /*attributes=*/std::move(attr)}, + /*inputs=*/{GetTensorRef(0, shape)}, + /*outputs=*/{GetTensorRef(2, shape)}); + ASSERT_TRUE(model.PopulateTensor(0, {0.0, -6.2, 2.0, -3.0})); + ASSERT_OK(model.Invoke(*NewElementwiseNodeShader(op_type))); + EXPECT_THAT(model.GetOutput(0), + Pointwise(FloatNear(1e-6), {0.0, -1.0, 2.0, -1.0})); +} + +TEST(ElementwiseTest, Minimum) { + OperationType op_type = OperationType::MINIMUM; + const BHWC shape(1, 2, 2, 1); + SingleOpModel model( 
+ {/*type=*/ToString(op_type), /*attributes=*/{}}, + /*inputs=*/{GetTensorRef(0, shape), GetTensorRef(1, shape)}, + /*outputs=*/{GetTensorRef(2, shape)}); + ASSERT_TRUE(model.PopulateTensor(0, {0.0, -6.2, 2.0, -3.0})); + ASSERT_TRUE(model.PopulateTensor(1, {1.0, 2.0, 3.0, -2.0})); + ASSERT_OK(model.Invoke(*NewElementwiseNodeShader(op_type))); + EXPECT_THAT(model.GetOutput(0), + Pointwise(FloatNear(1e-6), {0.0, -6.2, 2.0, -3.0})); +} + +TEST(ElementwiseTest, MinimumWithScalar) { + OperationType op_type = OperationType::MINIMUM; + const BHWC shape(1, 2, 2, 1); + ElementwiseAttributes attr; + attr.param = -1.0f; + SingleOpModel model( + {/*type=*/ToString(op_type), /*attributes=*/std::move(attr)}, + /*inputs=*/{GetTensorRef(0, shape)}, + /*outputs=*/{GetTensorRef(2, shape)}); + ASSERT_TRUE(model.PopulateTensor(0, {0.0, -6.2, 2.0, -3.0})); + ASSERT_OK(model.Invoke(*NewElementwiseNodeShader(op_type))); + EXPECT_THAT(model.GetOutput(0), + Pointwise(FloatNear(1e-6), {-1.0, -6.2, -1.0, -3.0})); +} + TEST(ElementwiseTest, Pow) { OperationType op_type = OperationType::POW; const BHWC shape(1, 2, 2, 1); diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/registry.cc b/tensorflow/lite/delegates/gpu/gl/kernels/registry.cc index 005aa7dfd38..924f7dbf1ec 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/registry.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/registry.cc @@ -96,6 +96,8 @@ class Registry : public NodeShader { insert_elementwise_op(Type::DIV); insert_elementwise_op(Type::HARD_SWISH); insert_elementwise_op(Type::LOG); + insert_elementwise_op(Type::MAXIMUM); + insert_elementwise_op(Type::MINIMUM); insert_elementwise_op(Type::POW); insert_elementwise_op(Type::RSQRT); insert_elementwise_op(Type::SIGMOID); diff --git a/tensorflow/lite/delegates/gpu/metal/api.cc b/tensorflow/lite/delegates/gpu/metal/api.cc index 8cf7e34a523..b7179cb98f5 100644 --- a/tensorflow/lite/delegates/gpu/metal/api.cc +++ b/tensorflow/lite/delegates/gpu/metal/api.cc @@ -266,10 +266,12 @@ Status RegisterPrimaryOps(const GraphFloat32& graph, const Node* node, case OperationType::TANH: *tasks = ElementwiseWithOneInput(node_id, inputs[0], outputs[0], op_type); break; - case OperationType::SUB: case OperationType::DIV: + case OperationType::MAXIMUM: + case OperationType::MINIMUM: case OperationType::POW: case OperationType::SQUARED_DIFF: + case OperationType::SUB: *tasks = ElementwiseWithTwoInputs(node_id, inputs, outputs[0], op_type); break; case OperationType::BATCH_NORMALIZATION: From 6343b77f134b28e2d6821d77dc471e62208f616d Mon Sep 17 00:00:00 2001 From: Terry Heo Date: Thu, 20 Feb 2020 00:47:06 -0800 Subject: [PATCH 338/442] Add Maximum & Minimum op support for Metal PiperOrigin-RevId: 296149175 Change-Id: I3d26f756cb8f5fe0d94fac3f8515da8b2124dcc4 --- tensorflow/lite/delegates/gpu/metal/api.cc | 9 ++- .../gpu/metal/kernels/elementwise.cc | 47 +++++++++------ .../delegates/gpu/metal/kernels/elementwise.h | 2 +- .../gpu/metal/kernels/elementwise_test.mm | 58 +++++++++++++++++++ 4 files changed, 94 insertions(+), 22 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/metal/api.cc b/tensorflow/lite/delegates/gpu/metal/api.cc index b7179cb98f5..802697ee9a9 100644 --- a/tensorflow/lite/delegates/gpu/metal/api.cc +++ b/tensorflow/lite/delegates/gpu/metal/api.cc @@ -271,9 +271,12 @@ Status RegisterPrimaryOps(const GraphFloat32& graph, const Node* node, case OperationType::MINIMUM: case OperationType::POW: case OperationType::SQUARED_DIFF: - case OperationType::SUB: - *tasks = ElementwiseWithTwoInputs(node_id, inputs, 
outputs[0], op_type); - break; + case OperationType::SUB: { + const ElementwiseAttributes* attr = + absl::any_cast(&node->operation.attributes); + *tasks = + ElementwiseWithTwoInputs(node_id, inputs, outputs[0], op_type, attr); + } break; case OperationType::BATCH_NORMALIZATION: case OperationType::BATCH_TO_SPACE: case OperationType::CONST: diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/elementwise.cc b/tensorflow/lite/delegates/gpu/metal/kernels/elementwise.cc index 2ce30231e9f..7a93fc6d670 100644 --- a/tensorflow/lite/delegates/gpu/metal/kernels/elementwise.cc +++ b/tensorflow/lite/delegates/gpu/metal/kernels/elementwise.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/metal/kernels/elementwise.h" +#include #include #include @@ -29,7 +30,8 @@ namespace metal { namespace { std::string GetElementwiseWithTwoInputsCode(int src_count, - OperationType op_type) { + OperationType op_type, + const float* scalar) { std::string code = R"( #include using namespace metal; @@ -49,33 +51,38 @@ std::string GetElementwiseWithTwoInputsCode(int src_count, int linear_index = (int(gid.z) * params.src_size.y + int(gid.y)) * params.src_size.x + int(gid.x); - )"; + FLT4 src_0 = src_buffer0[linear_index]; + )"; + if (scalar == nullptr) { + code += " FLT4 src_1 = src_buffer1[linear_index];"; + } else { + code += + absl::StrCat(" FLT4 src_1 = FLT4(", std::to_string(*scalar), ");"); + } switch (op_type) { case OperationType::DIV: { - code += - " FLT4 value = src_buffer0[linear_index] / " - "src_buffer1[linear_index];"; + code += " FLT4 value = src_0 / src_1;"; + break; + } + case OperationType::MAXIMUM: { + code += " FLT4 value = max(src_0, src_1);"; + break; + } + case OperationType::MINIMUM: { + code += " FLT4 value = min(src_0, src_1);"; break; } case OperationType::POW: { - code += - " FLT4 value = pow(src_buffer0[linear_index], " - "src_buffer1[linear_index]);"; + code += " FLT4 value = pow(src_0, src_1);"; break; } case OperationType::SQUARED_DIFF: { - code += R"( - FLT4 src_0 = src_buffer0[linear_index]; - FLT4 src_1 = src_buffer1[linear_index]; - FLT4 value = (src_0 - src_1) * (src_0 - src_1); - )"; + code += " FLT4 value = (src_0 - src_1) * (src_0 - src_1);"; break; } case OperationType::SUB: { - code += - " FLT4 value = src_buffer0[linear_index] - " - "src_buffer1[linear_index];"; + code += " FLT4 value = src_0 - src_1;"; break; } default: { @@ -92,12 +99,16 @@ std::string GetElementwiseWithTwoInputsCode(int src_count, std::vector ElementwiseWithTwoInputs( int id, std::vector input_ids, ValueId output_id, - OperationType op_type) { + OperationType op_type, const ElementwiseAttributes* attr) { + const float* scalar = nullptr; + if (attr) { + scalar = absl::get_if(&attr->param); + } auto desc = std::make_shared(); desc->id = id; desc->is_linkable = false; desc->shader_source = - GetElementwiseWithTwoInputsCode(input_ids.size(), op_type); + GetElementwiseWithTwoInputsCode(input_ids.size(), op_type, scalar); for (int i = 0; i < input_ids.size(); ++i) { const std::string buffer_name = diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/elementwise.h b/tensorflow/lite/delegates/gpu/metal/kernels/elementwise.h index c8cee339d1b..af70e433e79 100644 --- a/tensorflow/lite/delegates/gpu/metal/kernels/elementwise.h +++ b/tensorflow/lite/delegates/gpu/metal/kernels/elementwise.h @@ -27,7 +27,7 @@ namespace metal { std::vector ElementwiseWithTwoInputs( int id, std::vector input_ids, ValueId output_id, - OperationType op_type); + OperationType op_type, const 
ElementwiseAttributes* attr); std::vector ElementwiseWithOneInput( int id, ValueId input_id, ValueId output_id, OperationType op_type); diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/elementwise_test.mm b/tensorflow/lite/delegates/gpu/metal/kernels/elementwise_test.mm index deaedb519a2..c70fd7368de 100644 --- a/tensorflow/lite/delegates/gpu/metal/kernels/elementwise_test.mm +++ b/tensorflow/lite/delegates/gpu/metal/kernels/elementwise_test.mm @@ -118,6 +118,64 @@ TensorRef GetTensorRef(int ref, const BHWC& shape) { XCTAssertTrue(status.ok(), @"%s", status.error_message().c_str()); } +- (void)testMaximum { + OperationType op_type = OperationType::MAXIMUM; + const BHWC shape(1, 2, 2, 1); + SingleOpModel model({/*type=*/ToString(op_type), /*attributes=*/{}}, + /*inputs=*/{GetTensorRef(0, shape), GetTensorRef(1, shape)}, + /*outputs=*/{GetTensorRef(2, shape)}); + XCTAssertTrue(model.PopulateTensor(0, {0.0, -6.2, 2.0, -3.0})); + XCTAssertTrue(model.PopulateTensor(1, {1.0, 2.0, 3.0, -2.0})); + auto status = model.Invoke(); + XCTAssertTrue(status.ok(), @"%s", status.error_message().c_str()); + status = CompareVectors({1.0, 2.0, 3.0, -2.0}, model.GetOutput(0), 1e-6f); + XCTAssertTrue(status.ok(), @"%s", status.error_message().c_str()); +} + +- (void)testMaximumWithScalar { + OperationType op_type = OperationType::MAXIMUM; + const BHWC shape(1, 2, 2, 1); + tflite::gpu::ElementwiseAttributes attr; + attr.param = -1.0f; + SingleOpModel model({/*type=*/ToString(op_type), /*attributes=*/attr}, + /*inputs=*/{GetTensorRef(0, shape)}, + /*outputs=*/{GetTensorRef(1, shape)}); + XCTAssertTrue(model.PopulateTensor(0, {0.0, -6.2, 2.0, -3.0})); + auto status = model.Invoke(); + XCTAssertTrue(status.ok(), @"%s", status.error_message().c_str()); + status = CompareVectors({0.0, -1.0, 2.0, -1.0}, model.GetOutput(0), 1e-6f); + XCTAssertTrue(status.ok(), @"%s", status.error_message().c_str()); +} + +- (void)testMinimum { + OperationType op_type = OperationType::MINIMUM; + const BHWC shape(1, 2, 2, 1); + SingleOpModel model({/*type=*/ToString(op_type), /*attributes=*/{}}, + /*inputs=*/{GetTensorRef(0, shape), GetTensorRef(1, shape)}, + /*outputs=*/{GetTensorRef(2, shape)}); + XCTAssertTrue(model.PopulateTensor(0, {0.0, -6.2, 2.0, -3.0})); + XCTAssertTrue(model.PopulateTensor(1, {1.0, 2.0, 3.0, -2.0})); + auto status = model.Invoke(); + XCTAssertTrue(status.ok(), @"%s", status.error_message().c_str()); + status = CompareVectors({0.0, -6.2, 2.0, -3.0}, model.GetOutput(0), 1e-6f); + XCTAssertTrue(status.ok(), @"%s", status.error_message().c_str()); +} + +- (void)testMinimumWithScalar { + OperationType op_type = OperationType::MINIMUM; + const BHWC shape(1, 2, 2, 1); + tflite::gpu::ElementwiseAttributes attr; + attr.param = -1.0f; + SingleOpModel model({/*type=*/ToString(op_type), /*attributes=*/attr}, + /*inputs=*/{GetTensorRef(0, shape)}, + /*outputs=*/{GetTensorRef(1, shape)}); + XCTAssertTrue(model.PopulateTensor(0, {0.0, -6.2, 2.0, -3.0})); + auto status = model.Invoke(); + XCTAssertTrue(status.ok(), @"%s", status.error_message().c_str()); + status = CompareVectors({-1.0, -6.2, -1.0, -3.0}, model.GetOutput(0), 1e-6f); + XCTAssertTrue(status.ok(), @"%s", status.error_message().c_str()); +} + - (void)testPow { OperationType op_type = OperationType::POW; const BHWC shape(1, 2, 2, 1); From 0a04d3e52d0cc45764437237daef7286e7c67bc4 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 20 Feb 2020 00:47:43 -0800 Subject: [PATCH 339/442] Go: Update generated wrapper functions for TensorFlow ops. 
PiperOrigin-RevId: 296149237 Change-Id: I9d0bdb02115df83422421306024dd5255a320768 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 449a95765a5..ecdce1e627b 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. 
The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21345,7 +21345,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22053,7 +22053,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22249,7 +22249,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22318,7 +22318,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22433,7 +22433,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22492,7 +22492,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22666,7 +22666,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22857,7 +22857,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25297,7 +25297,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25629,7 +25629,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25679,7 +25679,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25929,7 +25929,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26559,7 +26559,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27624,7 +27624,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45536,7 +45536,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 3bc949d1e3c841de884fc20e7527ddf9398b816b Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Thu, 20 Feb 2020 00:58:38 -0800 Subject: [PATCH 340/442] Add a __init__.py file for tensorflow/compiler/tests. This is recommended standard, even though it looks like bazel automatically adds this if it is missing. PiperOrigin-RevId: 296150499 Change-Id: Iadf194be068d81aa07f4365b1057f3b7c28a6190 --- tensorflow/compiler/tests/BUILD | 5 ++++- tensorflow/compiler/tests/__init__.py | 0 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 tensorflow/compiler/tests/__init__.py diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index e3a62b3fa7b..203ef51c842 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -61,7 +61,10 @@ py_library( py_library( name = "test_utils", testonly = 1, - srcs = ["test_utils.py"], + srcs = [ + "__init__.py", + "test_utils.py", + ], srcs_version = "PY2AND3", deps = [ "//third_party/py/numpy", diff --git a/tensorflow/compiler/tests/__init__.py b/tensorflow/compiler/tests/__init__.py new file mode 100644 index 00000000000..e69de29bb2d From 2b418823ef9dab4a311d5a4ea5bc9d11be40039b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 20 Feb 2020 01:02:33 -0800 Subject: [PATCH 341/442] compat: Update forward compatibility horizon to 2020-02-20 PiperOrigin-RevId: 296151051 Change-Id: I570dbed58cefe274637462ca4d160abf36fac313 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index c6b49129920..e4638ead571 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -31,7 +31,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 2, 19) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 2, 20) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From 8c88f62d83db2ed771fccd7877e2ebb6855f8d19 Mon Sep 17 00:00:00 2001 From: Jakob Buchgraber Date: Thu, 20 Feb 2020 01:15:30 -0800 Subject: [PATCH 342/442] preconfig: remove checked in rocm and cuda10.0 configs as these configurations are placed by remote config PiperOrigin-RevId: 296152981 Change-Id: Ib19efb421a63a3b2e87d862eda3a34816e8a3890 --- tensorflow/opensource_only.files | 7 - third_party/toolchains/BUILD | 26 - .../toolchains/preconfig/generate/BUILD | 20 - .../ubuntu16.04/cuda10.0-cudnn7/WORKSPACE | 2 - .../ubuntu16.04/cuda10.0-cudnn7/cuda/BUILD | 1272 -------------- .../cuda10.0-cudnn7/cuda/build_defs.bzl | 76 - .../cuda10.0-cudnn7/cuda/cuda/cuda_config.h | 27 - .../preconfig/ubuntu16.04/py3_opt/BUILD | 209 --- .../preconfig/ubuntu16.04/py3_opt/WORKSPACE | 2 - .../preconfig/ubuntu16.04/rocm/WORKSPACE | 2 - .../preconfig/ubuntu16.04/rocm/rocm/BUILD | 1512 ----------------- .../ubuntu16.04/rocm/rocm/build_defs.bzl | 44 - .../ubuntu16.04/rocm/rocm/rocm/rocm_config.h | 21 - .../preconfig/ubuntu16.04/tensorrt5.1/BUILD | 63 - .../preconfig/ubuntu16.04/tensorrt5.1/LICENSE | 203 --- .../ubuntu16.04/tensorrt5.1/WORKSPACE | 2 - .../ubuntu16.04/tensorrt5.1/build_defs.bzl | 5 - .../tensorrt/include/tensorrt_config.h | 21 - 18 files changed, 3514 deletions(-) delete mode 100644 third_party/toolchains/preconfig/ubuntu16.04/cuda10.0-cudnn7/WORKSPACE delete mode 100755 third_party/toolchains/preconfig/ubuntu16.04/cuda10.0-cudnn7/cuda/BUILD delete mode 100755 third_party/toolchains/preconfig/ubuntu16.04/cuda10.0-cudnn7/cuda/build_defs.bzl delete mode 100755 third_party/toolchains/preconfig/ubuntu16.04/cuda10.0-cudnn7/cuda/cuda/cuda_config.h delete mode 100755 third_party/toolchains/preconfig/ubuntu16.04/py3_opt/BUILD delete mode 100644 third_party/toolchains/preconfig/ubuntu16.04/py3_opt/WORKSPACE delete mode 100644 third_party/toolchains/preconfig/ubuntu16.04/rocm/WORKSPACE delete mode 100755 third_party/toolchains/preconfig/ubuntu16.04/rocm/rocm/BUILD delete mode 100755 third_party/toolchains/preconfig/ubuntu16.04/rocm/rocm/build_defs.bzl delete mode 100755 third_party/toolchains/preconfig/ubuntu16.04/rocm/rocm/rocm/rocm_config.h delete mode 100755 third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/BUILD delete mode 100755 third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/LICENSE delete mode 100644 third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/WORKSPACE delete mode 100755 third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/build_defs.bzl delete mode 100755 third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/tensorrt/include/tensorrt_config.h diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files index 026f2675737..bba10464933 100644 --- a/tensorflow/opensource_only.files +++ b/tensorflow/opensource_only.files @@ -247,8 +247,6 @@ tensorflow/third_party/toolchains/preconfig/ubuntu16.04/clang/cc_toolchain_confi tensorflow/third_party/toolchains/preconfig/ubuntu16.04/clang/dummy_toolchain.bzl tensorflow/third_party/toolchains/preconfig/ubuntu16.04/clang_manylinux2010-cuda10.0/BUILD tensorflow/third_party/toolchains/preconfig/ubuntu16.04/clang_manylinux2010-cuda10.0/cc_toolchain_config.bzl -tensorflow/third_party/toolchains/preconfig/ubuntu16.04/cuda10.0-cudnn7/cuda/BUILD 
-tensorflow/third_party/toolchains/preconfig/ubuntu16.04/cuda10.0-cudnn7/cuda/build_defs.bzl tensorflow/third_party/toolchains/preconfig/ubuntu16.04/cuda10.1-cudnn7/cuda/BUILD tensorflow/third_party/toolchains/preconfig/ubuntu16.04/cuda10.1-cudnn7/cuda/build_defs.bzl tensorflow/third_party/toolchains/preconfig/ubuntu16.04/gcc5-rocm/BUILD @@ -262,11 +260,6 @@ tensorflow/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010/cc_to tensorflow/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010/dummy_toolchain.bzl tensorflow/third_party/toolchains/preconfig/ubuntu16.04/py/BUILD tensorflow/third_party/toolchains/preconfig/ubuntu16.04/py3/BUILD -tensorflow/third_party/toolchains/preconfig/ubuntu16.04/py3_opt/BUILD -tensorflow/third_party/toolchains/preconfig/ubuntu16.04/rocm/rocm/BUILD -tensorflow/third_party/toolchains/preconfig/ubuntu16.04/rocm/rocm/build_defs.bzl -tensorflow/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/BUILD -tensorflow/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/build_defs.bzl tensorflow/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5/BUILD tensorflow/third_party/toolchains/preconfig/ubuntu16.04/tensorrt6.0/BUILD tensorflow/third_party/toolchains/preconfig/ubuntu16.04/tensorrt6.0/build_defs.bzl diff --git a/third_party/toolchains/BUILD b/third_party/toolchains/BUILD index 4182b0010dc..a9c3ce3b4de 100644 --- a/third_party/toolchains/BUILD +++ b/third_party/toolchains/BUILD @@ -74,19 +74,6 @@ platform( }, ) -# Built with //tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu16.04-manylinux2010. -platform( - name = "rbe_cuda10.0-cudnn7-ubuntu16.04-manylinux2010", - constraint_values = [ - "@bazel_tools//platforms:x86_64", - "@bazel_tools//platforms:linux", - ], - exec_properties = { - "container-image": "docker://gcr.io/tensorflow-testing/nosla-cuda10.0-cudnn7-ubuntu16.04-manylinux2010@%s" % container_digests["cuda10.0-cudnn7-ubuntu16.04-manylinux2010"], - "Pool": "default", - }, -) - # Built with //tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010. 
platform( name = "rbe_cuda10.1-cudnn7-ubuntu16.04-manylinux2010", @@ -99,16 +86,3 @@ platform( "Pool": "default", }, ) - -# Built with //tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu16.04 -platform( - name = "rbe_rocm-ubuntu16.04", - constraint_values = [ - "@bazel_tools//platforms:x86_64", - "@bazel_tools//platforms:linux", - ], - exec_properties = { - "container-image": "docker://gcr.io/tensorflow-testing/nosla-rocm-ubuntu16.04@%s" % container_digests["rocm-ubuntu16.04"], - "Pool": "default", - }, -) diff --git a/third_party/toolchains/preconfig/generate/BUILD b/third_party/toolchains/preconfig/generate/BUILD index a73f21416f9..652279f4af1 100644 --- a/third_party/toolchains/preconfig/generate/BUILD +++ b/third_party/toolchains/preconfig/generate/BUILD @@ -91,18 +91,6 @@ tensorflow_rbe_config( python_version = "3.6", ) -tensorflow_rbe_config( - name = "ubuntu16.04-py3-gcc7_manylinux2010-cuda10.0-cudnn7-tensorrt5.1", - compiler = "/dt7/usr/bin/gcc", - compiler_prefix = "/usr/bin", - cuda_version = "10.0", - cudnn_version = "7", - os = "ubuntu16.04-manylinux2010", - python_version = "3", - tensorrt_install_path = "/usr", - tensorrt_version = "5.1", -) - tensorflow_rbe_config( name = "ubuntu16.04-py3-clang_manylinux2010-cuda10.0-cudnn7-tensorrt5.1", compiler = "/clang_r373795/bin/clang", @@ -138,11 +126,3 @@ tensorflow_rbe_config( tensorrt_install_path = "/usr", tensorrt_version = "6.0", ) - -tensorflow_rbe_config( - name = "ubuntu16.04-py3_opt-gcc5-rocm", - compiler = "gcc", - os = "ubuntu16.04", - python_version = "3", - rocm_version = "2.5", # Any version will do. -) diff --git a/third_party/toolchains/preconfig/ubuntu16.04/cuda10.0-cudnn7/WORKSPACE b/third_party/toolchains/preconfig/ubuntu16.04/cuda10.0-cudnn7/WORKSPACE deleted file mode 100644 index b61f572d6d2..00000000000 --- a/third_party/toolchains/preconfig/ubuntu16.04/cuda10.0-cudnn7/WORKSPACE +++ /dev/null @@ -1,2 +0,0 @@ -# DO NOT EDIT: automatically generated WORKSPACE file for cuda_configure rule -workspace(name = "local_config_cuda") diff --git a/third_party/toolchains/preconfig/ubuntu16.04/cuda10.0-cudnn7/cuda/BUILD b/third_party/toolchains/preconfig/ubuntu16.04/cuda10.0-cudnn7/cuda/BUILD deleted file mode 100755 index a301d1f382b..00000000000 --- a/third_party/toolchains/preconfig/ubuntu16.04/cuda10.0-cudnn7/cuda/BUILD +++ /dev/null @@ -1,1272 +0,0 @@ -load(":build_defs.bzl", "cuda_header_library") -load("@bazel_skylib//:bzl_library.bzl", "bzl_library") - -licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like - -package(default_visibility = ["//visibility:public"]) - -config_setting( - name = "using_nvcc", - values = { - "define": "using_cuda_nvcc=true", - }, -) - -config_setting( - name = "using_clang", - values = { - "define": "using_cuda_clang=true", - }, -) - -# Equivalent to using_clang && -c opt. 
-config_setting( - name = "using_clang_opt", - values = { - "define": "using_cuda_clang=true", - "compilation_mode": "opt", - }, -) - -config_setting( - name = "darwin", - values = {"cpu": "darwin"}, -) - -config_setting( - name = "freebsd", - values = {"cpu": "freebsd"}, -) - -cuda_header_library( - name = "cuda_headers", - hdrs = [ - "cuda/cuda_config.h", - ":cuda-include", - ], - include_prefix = "third_party/gpus", - includes = [ - ".", # required to include cuda/cuda/cuda_config.h as cuda/config.h - "cuda/include", - ], -) - -cc_library( - name = "cudart_static", - srcs = ["cuda/lib/libcudart_static.a"], - linkopts = select({ - ":freebsd": [], - "//conditions:default": ["-ldl"], - }) + [ - "-lpthread", - "-lrt", - ], -) - -cc_library( - name = "cuda_driver", - srcs = ["cuda/lib/libcuda.so"], -) - -cc_library( - name = "cudart", - srcs = ["cuda/lib/libcudart.so.10.0"], - data = ["cuda/lib/libcudart.so.10.0"], - linkstatic = 1, -) - -cuda_header_library( - name = "cublas_headers", - hdrs = [":cublas-include"], - include_prefix = "third_party/gpus/cuda/include", - includes = ["cublas/include"], - strip_include_prefix = "cublas/include", - deps = [":cuda_headers"], -) - -cc_library( - name = "cublas", - srcs = ["cuda/lib/libcublas.so.10.0"], - data = ["cuda/lib/libcublas.so.10.0"], - linkstatic = 1, -) - -cc_library( - name = "cusolver", - srcs = ["cuda/lib/libcusolver.so.10.0"], - data = ["cuda/lib/libcusolver.so.10.0"], - linkopts = ["-lgomp"], - linkstatic = 1, -) - -cc_library( - name = "cudnn", - srcs = ["cuda/lib/libcudnn.so.7"], - data = ["cuda/lib/libcudnn.so.7"], - linkstatic = 1, -) - -cc_library( - name = "cudnn_header", - hdrs = [":cudnn-include"], - include_prefix = "third_party/gpus/cudnn", - strip_include_prefix = "cudnn/include", - deps = [":cuda_headers"], -) - -cc_library( - name = "cufft", - srcs = ["cuda/lib/libcufft.so.10.0"], - data = ["cuda/lib/libcufft.so.10.0"], - linkstatic = 1, -) - -cc_library( - name = "curand", - srcs = ["cuda/lib/libcurand.so.10.0"], - data = ["cuda/lib/libcurand.so.10.0"], - linkstatic = 1, -) - -cc_library( - name = "cuda", - deps = [ - ":cublas", - ":cuda_headers", - ":cudart", - ":cudnn", - ":cufft", - ":curand", - ], -) - -cuda_header_library( - name = "cupti_headers", - hdrs = [":cuda-extras"], - include_prefix = "third_party/gpus", - includes = ["cuda/extras/CUPTI/include/"], - deps = [":cuda_headers"], -) - -cc_library( - name = "cupti_dsos", - data = ["cuda/lib/libcupti.so.10.0"], -) - -cc_library( - name = "cusparse", - srcs = ["cuda/lib/libcusparse.so.10.0"], - data = ["cuda/lib/libcusparse.so.10.0"], - linkopts = ["-lgomp"], - linkstatic = 1, -) - -cc_library( - name = "libdevice_root", - data = [":cuda-nvvm"], -) - -bzl_library( - name = "build_defs_bzl", - srcs = ["build_defs.bzl"], - deps = [ - "@bazel_skylib//lib:selects", - ], -) - -genrule( - name = "cuda-include", - outs = [ - "cuda/include/CL/cl.h", - "cuda/include/CL/cl.hpp", - "cuda/include/CL/cl_egl.h", - "cuda/include/CL/cl_ext.h", - "cuda/include/CL/cl_gl.h", - "cuda/include/CL/cl_gl_ext.h", - "cuda/include/CL/cl_platform.h", - "cuda/include/CL/opencl.h", - "cuda/include/builtin_types.h", - "cuda/include/channel_descriptor.h", - "cuda/include/common_functions.h", - "cuda/include/cooperative_groups.h", - "cuda/include/cooperative_groups_helpers.h", - "cuda/include/crt/common_functions.h", - "cuda/include/crt/device_double_functions.h", - "cuda/include/crt/device_double_functions.hpp", - "cuda/include/crt/device_functions.h", - "cuda/include/crt/device_functions.hpp", 
- "cuda/include/crt/func_macro.h", - "cuda/include/crt/host_config.h", - "cuda/include/crt/host_defines.h", - "cuda/include/crt/host_runtime.h", - "cuda/include/crt/math_functions.h", - "cuda/include/crt/math_functions.hpp", - "cuda/include/crt/mma.h", - "cuda/include/crt/mma.hpp", - "cuda/include/crt/nvfunctional", - "cuda/include/crt/sm_70_rt.h", - "cuda/include/crt/sm_70_rt.hpp", - "cuda/include/crt/storage_class.h", - "cuda/include/cuComplex.h", - "cuda/include/cublas.h", - "cuda/include/cublasXt.h", - "cuda/include/cublas_api.h", - "cuda/include/cublas_v2.h", - "cuda/include/cuda.h", - "cuda/include/cudaEGL.h", - "cuda/include/cudaGL.h", - "cuda/include/cudaProfiler.h", - "cuda/include/cudaVDPAU.h", - "cuda/include/cuda_device_runtime_api.h", - "cuda/include/cuda_egl_interop.h", - "cuda/include/cuda_fp16.h", - "cuda/include/cuda_fp16.hpp", - "cuda/include/cuda_gl_interop.h", - "cuda/include/cuda_occupancy.h", - "cuda/include/cuda_profiler_api.h", - "cuda/include/cuda_runtime.h", - "cuda/include/cuda_runtime_api.h", - "cuda/include/cuda_surface_types.h", - "cuda/include/cuda_texture_types.h", - "cuda/include/cuda_vdpau_interop.h", - "cuda/include/cudalibxt.h", - "cuda/include/cudart_platform.h", - "cuda/include/cufft.h", - "cuda/include/cufftXt.h", - "cuda/include/cufftw.h", - "cuda/include/curand.h", - "cuda/include/curand_discrete.h", - "cuda/include/curand_discrete2.h", - "cuda/include/curand_globals.h", - "cuda/include/curand_kernel.h", - "cuda/include/curand_lognormal.h", - "cuda/include/curand_mrg32k3a.h", - "cuda/include/curand_mtgp32.h", - "cuda/include/curand_mtgp32_host.h", - "cuda/include/curand_mtgp32_kernel.h", - "cuda/include/curand_mtgp32dc_p_11213.h", - "cuda/include/curand_normal.h", - "cuda/include/curand_normal_static.h", - "cuda/include/curand_philox4x32_x.h", - "cuda/include/curand_poisson.h", - "cuda/include/curand_precalc.h", - "cuda/include/curand_uniform.h", - "cuda/include/cusolverDn.h", - "cuda/include/cusolverRf.h", - "cuda/include/cusolverSp.h", - "cuda/include/cusolverSp_LOWLEVEL_PREVIEW.h", - "cuda/include/cusolver_common.h", - "cuda/include/cusparse.h", - "cuda/include/cusparse_v2.h", - "cuda/include/device_atomic_functions.h", - "cuda/include/device_atomic_functions.hpp", - "cuda/include/device_double_functions.h", - "cuda/include/device_functions.h", - "cuda/include/device_launch_parameters.h", - "cuda/include/device_types.h", - "cuda/include/driver_functions.h", - "cuda/include/driver_types.h", - "cuda/include/fatBinaryCtl.h", - "cuda/include/fatbinary.h", - "cuda/include/host_config.h", - "cuda/include/host_defines.h", - "cuda/include/library_types.h", - "cuda/include/math_constants.h", - "cuda/include/math_functions.h", - "cuda/include/mma.h", - "cuda/include/npp.h", - "cuda/include/nppcore.h", - "cuda/include/nppdefs.h", - "cuda/include/nppi.h", - "cuda/include/nppi_arithmetic_and_logical_operations.h", - "cuda/include/nppi_color_conversion.h", - "cuda/include/nppi_compression_functions.h", - "cuda/include/nppi_computer_vision.h", - "cuda/include/nppi_data_exchange_and_initialization.h", - "cuda/include/nppi_filtering_functions.h", - "cuda/include/nppi_geometry_transforms.h", - "cuda/include/nppi_linear_transforms.h", - "cuda/include/nppi_morphological_operations.h", - "cuda/include/nppi_statistics_functions.h", - "cuda/include/nppi_support_functions.h", - "cuda/include/nppi_threshold_and_compare_operations.h", - "cuda/include/npps.h", - "cuda/include/npps_arithmetic_and_logical_operations.h", - "cuda/include/npps_conversion_functions.h", - 
"cuda/include/npps_filtering_functions.h", - "cuda/include/npps_initialization.h", - "cuda/include/npps_statistics_functions.h", - "cuda/include/npps_support_functions.h", - "cuda/include/nppversion.h", - "cuda/include/nvToolsExt.h", - "cuda/include/nvToolsExtCuda.h", - "cuda/include/nvToolsExtCudaRt.h", - "cuda/include/nvToolsExtMeta.h", - "cuda/include/nvToolsExtSync.h", - "cuda/include/nvblas.h", - "cuda/include/nvfunctional", - "cuda/include/nvgraph.h", - "cuda/include/nvjpeg.h", - "cuda/include/nvml.h", - "cuda/include/nvrtc.h", - "cuda/include/nvtx3/nvToolsExt.h", - "cuda/include/nvtx3/nvToolsExtCuda.h", - "cuda/include/nvtx3/nvToolsExtCudaRt.h", - "cuda/include/nvtx3/nvToolsExtOpenCL.h", - "cuda/include/nvtx3/nvToolsExtSync.h", - "cuda/include/nvtx3/nvtxDetail/nvtxImpl.h", - "cuda/include/nvtx3/nvtxDetail/nvtxImplCore.h", - "cuda/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h", - "cuda/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h", - "cuda/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h", - "cuda/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h", - "cuda/include/nvtx3/nvtxDetail/nvtxInit.h", - "cuda/include/nvtx3/nvtxDetail/nvtxInitDecls.h", - "cuda/include/nvtx3/nvtxDetail/nvtxInitDefs.h", - "cuda/include/nvtx3/nvtxDetail/nvtxLinkOnce.h", - "cuda/include/nvtx3/nvtxDetail/nvtxTypes.h", - "cuda/include/sm_20_atomic_functions.h", - "cuda/include/sm_20_atomic_functions.hpp", - "cuda/include/sm_20_intrinsics.h", - "cuda/include/sm_20_intrinsics.hpp", - "cuda/include/sm_30_intrinsics.h", - "cuda/include/sm_30_intrinsics.hpp", - "cuda/include/sm_32_atomic_functions.h", - "cuda/include/sm_32_atomic_functions.hpp", - "cuda/include/sm_32_intrinsics.h", - "cuda/include/sm_32_intrinsics.hpp", - "cuda/include/sm_35_atomic_functions.h", - "cuda/include/sm_35_intrinsics.h", - "cuda/include/sm_60_atomic_functions.h", - "cuda/include/sm_60_atomic_functions.hpp", - "cuda/include/sm_61_intrinsics.h", - "cuda/include/sm_61_intrinsics.hpp", - "cuda/include/sobol_direction_vectors.h", - "cuda/include/surface_functions.h", - "cuda/include/surface_functions.hpp", - "cuda/include/surface_indirect_functions.h", - "cuda/include/surface_indirect_functions.hpp", - "cuda/include/surface_types.h", - "cuda/include/texture_fetch_functions.h", - "cuda/include/texture_fetch_functions.hpp", - "cuda/include/texture_indirect_functions.h", - "cuda/include/texture_indirect_functions.hpp", - "cuda/include/texture_types.h", - "cuda/include/thrust/adjacent_difference.h", - "cuda/include/thrust/advance.h", - "cuda/include/thrust/binary_search.h", - "cuda/include/thrust/complex.h", - "cuda/include/thrust/copy.h", - "cuda/include/thrust/count.h", - "cuda/include/thrust/detail/adjacent_difference.inl", - "cuda/include/thrust/detail/advance.inl", - "cuda/include/thrust/detail/alignment.h", - "cuda/include/thrust/detail/allocator/allocator_traits.h", - "cuda/include/thrust/detail/allocator/allocator_traits.inl", - "cuda/include/thrust/detail/allocator/copy_construct_range.h", - "cuda/include/thrust/detail/allocator/copy_construct_range.inl", - "cuda/include/thrust/detail/allocator/default_construct_range.h", - "cuda/include/thrust/detail/allocator/default_construct_range.inl", - "cuda/include/thrust/detail/allocator/destroy_range.h", - "cuda/include/thrust/detail/allocator/destroy_range.inl", - "cuda/include/thrust/detail/allocator/fill_construct_range.h", - "cuda/include/thrust/detail/allocator/fill_construct_range.inl", - "cuda/include/thrust/detail/allocator/malloc_allocator.h", - 
"cuda/include/thrust/detail/allocator/malloc_allocator.inl", - "cuda/include/thrust/detail/allocator/no_throw_allocator.h", - "cuda/include/thrust/detail/allocator/tagged_allocator.h", - "cuda/include/thrust/detail/allocator/tagged_allocator.inl", - "cuda/include/thrust/detail/allocator/temporary_allocator.h", - "cuda/include/thrust/detail/allocator/temporary_allocator.inl", - "cuda/include/thrust/detail/binary_search.inl", - "cuda/include/thrust/detail/complex/arithmetic.h", - "cuda/include/thrust/detail/complex/c99math.h", - "cuda/include/thrust/detail/complex/catrig.h", - "cuda/include/thrust/detail/complex/catrigf.h", - "cuda/include/thrust/detail/complex/ccosh.h", - "cuda/include/thrust/detail/complex/ccoshf.h", - "cuda/include/thrust/detail/complex/cexp.h", - "cuda/include/thrust/detail/complex/cexpf.h", - "cuda/include/thrust/detail/complex/clog.h", - "cuda/include/thrust/detail/complex/clogf.h", - "cuda/include/thrust/detail/complex/complex.inl", - "cuda/include/thrust/detail/complex/cpow.h", - "cuda/include/thrust/detail/complex/cproj.h", - "cuda/include/thrust/detail/complex/csinh.h", - "cuda/include/thrust/detail/complex/csinhf.h", - "cuda/include/thrust/detail/complex/csqrt.h", - "cuda/include/thrust/detail/complex/csqrtf.h", - "cuda/include/thrust/detail/complex/ctanh.h", - "cuda/include/thrust/detail/complex/ctanhf.h", - "cuda/include/thrust/detail/complex/math_private.h", - "cuda/include/thrust/detail/complex/stream.h", - "cuda/include/thrust/detail/config.h", - "cuda/include/thrust/detail/config/compiler.h", - "cuda/include/thrust/detail/config/compiler_fence.h", - "cuda/include/thrust/detail/config/config.h", - "cuda/include/thrust/detail/config/debug.h", - "cuda/include/thrust/detail/config/device_system.h", - "cuda/include/thrust/detail/config/exec_check_disable.h", - "cuda/include/thrust/detail/config/forceinline.h", - "cuda/include/thrust/detail/config/global_workarounds.h", - "cuda/include/thrust/detail/config/host_device.h", - "cuda/include/thrust/detail/config/host_system.h", - "cuda/include/thrust/detail/config/simple_defines.h", - "cuda/include/thrust/detail/contiguous_storage.h", - "cuda/include/thrust/detail/contiguous_storage.inl", - "cuda/include/thrust/detail/copy.h", - "cuda/include/thrust/detail/copy.inl", - "cuda/include/thrust/detail/copy_if.h", - "cuda/include/thrust/detail/copy_if.inl", - "cuda/include/thrust/detail/count.inl", - "cuda/include/thrust/detail/cstdint.h", - "cuda/include/thrust/detail/device_delete.inl", - "cuda/include/thrust/detail/device_free.inl", - "cuda/include/thrust/detail/device_malloc.inl", - "cuda/include/thrust/detail/device_new.inl", - "cuda/include/thrust/detail/device_ptr.inl", - "cuda/include/thrust/detail/device_reference.inl", - "cuda/include/thrust/detail/device_vector.inl", - "cuda/include/thrust/detail/dispatch/is_trivial_copy.h", - "cuda/include/thrust/detail/distance.inl", - "cuda/include/thrust/detail/equal.inl", - "cuda/include/thrust/detail/execute_with_allocator.h", - "cuda/include/thrust/detail/execution_policy.h", - "cuda/include/thrust/detail/extrema.inl", - "cuda/include/thrust/detail/fill.inl", - "cuda/include/thrust/detail/find.inl", - "cuda/include/thrust/detail/for_each.inl", - "cuda/include/thrust/detail/function.h", - "cuda/include/thrust/detail/functional.inl", - "cuda/include/thrust/detail/functional/actor.h", - "cuda/include/thrust/detail/functional/actor.inl", - "cuda/include/thrust/detail/functional/argument.h", - "cuda/include/thrust/detail/functional/composite.h", - 
"cuda/include/thrust/detail/functional/operators.h", - "cuda/include/thrust/detail/functional/operators/arithmetic_operators.h", - "cuda/include/thrust/detail/functional/operators/assignment_operator.h", - "cuda/include/thrust/detail/functional/operators/bitwise_operators.h", - "cuda/include/thrust/detail/functional/operators/compound_assignment_operators.h", - "cuda/include/thrust/detail/functional/operators/logical_operators.h", - "cuda/include/thrust/detail/functional/operators/operator_adaptors.h", - "cuda/include/thrust/detail/functional/operators/relational_operators.h", - "cuda/include/thrust/detail/functional/placeholder.h", - "cuda/include/thrust/detail/functional/value.h", - "cuda/include/thrust/detail/gather.inl", - "cuda/include/thrust/detail/generate.inl", - "cuda/include/thrust/detail/get_iterator_value.h", - "cuda/include/thrust/detail/host_vector.inl", - "cuda/include/thrust/detail/inner_product.inl", - "cuda/include/thrust/detail/integer_math.h", - "cuda/include/thrust/detail/integer_traits.h", - "cuda/include/thrust/detail/internal_functional.h", - "cuda/include/thrust/detail/logical.inl", - "cuda/include/thrust/detail/malloc_and_free.h", - "cuda/include/thrust/detail/merge.inl", - "cuda/include/thrust/detail/minmax.h", - "cuda/include/thrust/detail/mismatch.inl", - "cuda/include/thrust/detail/mpl/math.h", - "cuda/include/thrust/detail/numeric_traits.h", - "cuda/include/thrust/detail/overlapped_copy.h", - "cuda/include/thrust/detail/pair.inl", - "cuda/include/thrust/detail/partition.inl", - "cuda/include/thrust/detail/pointer.h", - "cuda/include/thrust/detail/pointer.inl", - "cuda/include/thrust/detail/preprocessor.h", - "cuda/include/thrust/detail/range/head_flags.h", - "cuda/include/thrust/detail/range/tail_flags.h", - "cuda/include/thrust/detail/raw_pointer_cast.h", - "cuda/include/thrust/detail/raw_reference_cast.h", - "cuda/include/thrust/detail/reduce.inl", - "cuda/include/thrust/detail/reference.h", - "cuda/include/thrust/detail/reference.inl", - "cuda/include/thrust/detail/reference_forward_declaration.h", - "cuda/include/thrust/detail/remove.inl", - "cuda/include/thrust/detail/replace.inl", - "cuda/include/thrust/detail/reverse.inl", - "cuda/include/thrust/detail/scan.inl", - "cuda/include/thrust/detail/scatter.inl", - "cuda/include/thrust/detail/seq.h", - "cuda/include/thrust/detail/sequence.inl", - "cuda/include/thrust/detail/set_operations.inl", - "cuda/include/thrust/detail/sort.inl", - "cuda/include/thrust/detail/static_assert.h", - "cuda/include/thrust/detail/static_map.h", - "cuda/include/thrust/detail/swap.h", - "cuda/include/thrust/detail/swap.inl", - "cuda/include/thrust/detail/swap_ranges.inl", - "cuda/include/thrust/detail/tabulate.inl", - "cuda/include/thrust/detail/temporary_array.h", - "cuda/include/thrust/detail/temporary_array.inl", - "cuda/include/thrust/detail/temporary_buffer.h", - "cuda/include/thrust/detail/transform.inl", - "cuda/include/thrust/detail/transform_reduce.inl", - "cuda/include/thrust/detail/transform_scan.inl", - "cuda/include/thrust/detail/trivial_sequence.h", - "cuda/include/thrust/detail/tuple.inl", - "cuda/include/thrust/detail/tuple_meta_transform.h", - "cuda/include/thrust/detail/tuple_transform.h", - "cuda/include/thrust/detail/type_traits.h", - "cuda/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h", - "cuda/include/thrust/detail/type_traits/function_traits.h", - "cuda/include/thrust/detail/type_traits/has_member_function.h", - 
"cuda/include/thrust/detail/type_traits/has_nested_type.h", - "cuda/include/thrust/detail/type_traits/has_trivial_assign.h", - "cuda/include/thrust/detail/type_traits/is_call_possible.h", - "cuda/include/thrust/detail/type_traits/is_metafunction_defined.h", - "cuda/include/thrust/detail/type_traits/iterator/is_discard_iterator.h", - "cuda/include/thrust/detail/type_traits/iterator/is_output_iterator.h", - "cuda/include/thrust/detail/type_traits/minimum_type.h", - "cuda/include/thrust/detail/type_traits/pointer_traits.h", - "cuda/include/thrust/detail/type_traits/result_of_adaptable_function.h", - "cuda/include/thrust/detail/uninitialized_copy.inl", - "cuda/include/thrust/detail/uninitialized_fill.inl", - "cuda/include/thrust/detail/unique.inl", - "cuda/include/thrust/detail/use_default.h", - "cuda/include/thrust/detail/util/align.h", - "cuda/include/thrust/detail/util/blocking.h", - "cuda/include/thrust/detail/vector_base.h", - "cuda/include/thrust/detail/vector_base.inl", - "cuda/include/thrust/device_allocator.h", - "cuda/include/thrust/device_delete.h", - "cuda/include/thrust/device_free.h", - "cuda/include/thrust/device_malloc.h", - "cuda/include/thrust/device_malloc_allocator.h", - "cuda/include/thrust/device_new.h", - "cuda/include/thrust/device_new_allocator.h", - "cuda/include/thrust/device_ptr.h", - "cuda/include/thrust/device_reference.h", - "cuda/include/thrust/device_vector.h", - "cuda/include/thrust/distance.h", - "cuda/include/thrust/equal.h", - "cuda/include/thrust/execution_policy.h", - "cuda/include/thrust/extrema.h", - "cuda/include/thrust/fill.h", - "cuda/include/thrust/find.h", - "cuda/include/thrust/for_each.h", - "cuda/include/thrust/functional.h", - "cuda/include/thrust/gather.h", - "cuda/include/thrust/generate.h", - "cuda/include/thrust/host_vector.h", - "cuda/include/thrust/inner_product.h", - "cuda/include/thrust/iterator/constant_iterator.h", - "cuda/include/thrust/iterator/counting_iterator.h", - "cuda/include/thrust/iterator/detail/any_assign.h", - "cuda/include/thrust/iterator/detail/any_system_tag.h", - "cuda/include/thrust/iterator/detail/constant_iterator_base.h", - "cuda/include/thrust/iterator/detail/counting_iterator.inl", - "cuda/include/thrust/iterator/detail/device_system_tag.h", - "cuda/include/thrust/iterator/detail/discard_iterator_base.h", - "cuda/include/thrust/iterator/detail/distance_from_result.h", - "cuda/include/thrust/iterator/detail/host_system_tag.h", - "cuda/include/thrust/iterator/detail/is_iterator_category.h", - "cuda/include/thrust/iterator/detail/is_trivial_iterator.h", - "cuda/include/thrust/iterator/detail/iterator_adaptor_base.h", - "cuda/include/thrust/iterator/detail/iterator_category_to_system.h", - "cuda/include/thrust/iterator/detail/iterator_category_to_traversal.h", - "cuda/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h", - "cuda/include/thrust/iterator/detail/iterator_facade_category.h", - "cuda/include/thrust/iterator/detail/iterator_traits.inl", - "cuda/include/thrust/iterator/detail/iterator_traversal_tags.h", - "cuda/include/thrust/iterator/detail/join_iterator.h", - "cuda/include/thrust/iterator/detail/minimum_category.h", - "cuda/include/thrust/iterator/detail/minimum_system.h", - "cuda/include/thrust/iterator/detail/normal_iterator.h", - "cuda/include/thrust/iterator/detail/permutation_iterator_base.h", - "cuda/include/thrust/iterator/detail/retag.h", - "cuda/include/thrust/iterator/detail/reverse_iterator.inl", - "cuda/include/thrust/iterator/detail/reverse_iterator_base.h", - 
"cuda/include/thrust/iterator/detail/tagged_iterator.h", - "cuda/include/thrust/iterator/detail/transform_iterator.inl", - "cuda/include/thrust/iterator/detail/transform_output_iterator.inl", - "cuda/include/thrust/iterator/detail/tuple_of_iterator_references.h", - "cuda/include/thrust/iterator/detail/universal_categories.h", - "cuda/include/thrust/iterator/detail/zip_iterator.inl", - "cuda/include/thrust/iterator/detail/zip_iterator_base.h", - "cuda/include/thrust/iterator/discard_iterator.h", - "cuda/include/thrust/iterator/iterator_adaptor.h", - "cuda/include/thrust/iterator/iterator_categories.h", - "cuda/include/thrust/iterator/iterator_facade.h", - "cuda/include/thrust/iterator/iterator_traits.h", - "cuda/include/thrust/iterator/permutation_iterator.h", - "cuda/include/thrust/iterator/retag.h", - "cuda/include/thrust/iterator/reverse_iterator.h", - "cuda/include/thrust/iterator/transform_iterator.h", - "cuda/include/thrust/iterator/transform_output_iterator.h", - "cuda/include/thrust/iterator/zip_iterator.h", - "cuda/include/thrust/logical.h", - "cuda/include/thrust/memory.h", - "cuda/include/thrust/merge.h", - "cuda/include/thrust/mismatch.h", - "cuda/include/thrust/pair.h", - "cuda/include/thrust/partition.h", - "cuda/include/thrust/random.h", - "cuda/include/thrust/random/detail/discard_block_engine.inl", - "cuda/include/thrust/random/detail/linear_congruential_engine.inl", - "cuda/include/thrust/random/detail/linear_congruential_engine_discard.h", - "cuda/include/thrust/random/detail/linear_feedback_shift_engine.inl", - "cuda/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h", - "cuda/include/thrust/random/detail/mod.h", - "cuda/include/thrust/random/detail/normal_distribution.inl", - "cuda/include/thrust/random/detail/normal_distribution_base.h", - "cuda/include/thrust/random/detail/random_core_access.h", - "cuda/include/thrust/random/detail/subtract_with_carry_engine.inl", - "cuda/include/thrust/random/detail/uniform_int_distribution.inl", - "cuda/include/thrust/random/detail/uniform_real_distribution.inl", - "cuda/include/thrust/random/detail/xor_combine_engine.inl", - "cuda/include/thrust/random/detail/xor_combine_engine_max.h", - "cuda/include/thrust/random/discard_block_engine.h", - "cuda/include/thrust/random/linear_congruential_engine.h", - "cuda/include/thrust/random/linear_feedback_shift_engine.h", - "cuda/include/thrust/random/normal_distribution.h", - "cuda/include/thrust/random/subtract_with_carry_engine.h", - "cuda/include/thrust/random/uniform_int_distribution.h", - "cuda/include/thrust/random/uniform_real_distribution.h", - "cuda/include/thrust/random/xor_combine_engine.h", - "cuda/include/thrust/reduce.h", - "cuda/include/thrust/remove.h", - "cuda/include/thrust/replace.h", - "cuda/include/thrust/reverse.h", - "cuda/include/thrust/scan.h", - "cuda/include/thrust/scatter.h", - "cuda/include/thrust/sequence.h", - "cuda/include/thrust/set_operations.h", - "cuda/include/thrust/sort.h", - "cuda/include/thrust/swap.h", - "cuda/include/thrust/system/cpp/detail/adjacent_difference.h", - "cuda/include/thrust/system/cpp/detail/assign_value.h", - "cuda/include/thrust/system/cpp/detail/binary_search.h", - "cuda/include/thrust/system/cpp/detail/copy.h", - "cuda/include/thrust/system/cpp/detail/copy_if.h", - "cuda/include/thrust/system/cpp/detail/count.h", - "cuda/include/thrust/system/cpp/detail/equal.h", - "cuda/include/thrust/system/cpp/detail/execution_policy.h", - "cuda/include/thrust/system/cpp/detail/extrema.h", - 
"cuda/include/thrust/system/cpp/detail/fill.h", - "cuda/include/thrust/system/cpp/detail/find.h", - "cuda/include/thrust/system/cpp/detail/for_each.h", - "cuda/include/thrust/system/cpp/detail/gather.h", - "cuda/include/thrust/system/cpp/detail/generate.h", - "cuda/include/thrust/system/cpp/detail/get_value.h", - "cuda/include/thrust/system/cpp/detail/inner_product.h", - "cuda/include/thrust/system/cpp/detail/iter_swap.h", - "cuda/include/thrust/system/cpp/detail/logical.h", - "cuda/include/thrust/system/cpp/detail/malloc_and_free.h", - "cuda/include/thrust/system/cpp/detail/memory.inl", - "cuda/include/thrust/system/cpp/detail/merge.h", - "cuda/include/thrust/system/cpp/detail/mismatch.h", - "cuda/include/thrust/system/cpp/detail/par.h", - "cuda/include/thrust/system/cpp/detail/partition.h", - "cuda/include/thrust/system/cpp/detail/reduce.h", - "cuda/include/thrust/system/cpp/detail/reduce_by_key.h", - "cuda/include/thrust/system/cpp/detail/remove.h", - "cuda/include/thrust/system/cpp/detail/replace.h", - "cuda/include/thrust/system/cpp/detail/reverse.h", - "cuda/include/thrust/system/cpp/detail/scan.h", - "cuda/include/thrust/system/cpp/detail/scan_by_key.h", - "cuda/include/thrust/system/cpp/detail/scatter.h", - "cuda/include/thrust/system/cpp/detail/sequence.h", - "cuda/include/thrust/system/cpp/detail/set_operations.h", - "cuda/include/thrust/system/cpp/detail/sort.h", - "cuda/include/thrust/system/cpp/detail/swap_ranges.h", - "cuda/include/thrust/system/cpp/detail/tabulate.h", - "cuda/include/thrust/system/cpp/detail/temporary_buffer.h", - "cuda/include/thrust/system/cpp/detail/transform.h", - "cuda/include/thrust/system/cpp/detail/transform_reduce.h", - "cuda/include/thrust/system/cpp/detail/transform_scan.h", - "cuda/include/thrust/system/cpp/detail/uninitialized_copy.h", - "cuda/include/thrust/system/cpp/detail/uninitialized_fill.h", - "cuda/include/thrust/system/cpp/detail/unique.h", - "cuda/include/thrust/system/cpp/detail/unique_by_key.h", - "cuda/include/thrust/system/cpp/detail/vector.inl", - "cuda/include/thrust/system/cpp/execution_policy.h", - "cuda/include/thrust/system/cpp/memory.h", - "cuda/include/thrust/system/cpp/vector.h", - "cuda/include/thrust/system/cuda/config.h", - "cuda/include/thrust/system/cuda/detail/adjacent_difference.h", - "cuda/include/thrust/system/cuda/detail/assign_value.h", - "cuda/include/thrust/system/cuda/detail/binary_search.h", - "cuda/include/thrust/system/cuda/detail/copy.h", - "cuda/include/thrust/system/cuda/detail/copy_if.h", - "cuda/include/thrust/system/cuda/detail/core/agent_launcher.h", - "cuda/include/thrust/system/cuda/detail/core/alignment.h", - "cuda/include/thrust/system/cuda/detail/core/triple_chevron_launch.h", - "cuda/include/thrust/system/cuda/detail/core/util.h", - "cuda/include/thrust/system/cuda/detail/count.h", - "cuda/include/thrust/system/cuda/detail/cross_system.h", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh", - 
"cuda/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh", - "cuda/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_load.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_scan.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/block_store.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh", - "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh", - "cuda/include/thrust/system/cuda/detail/cub/cub.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_partition.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_scan.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_select.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh", - "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh", - 
"cuda/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh", - "cuda/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh", - "cuda/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh", - "cuda/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh", - "cuda/include/thrust/system/cuda/detail/cub/host/mutex.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh", - "cuda/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh", - "cuda/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh", - "cuda/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh", - "cuda/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_allocator.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_arch.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_debug.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_device.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_macro.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_namespace.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_ptx.cuh", - "cuda/include/thrust/system/cuda/detail/cub/util_type.cuh", - "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh", - "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh", - "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh", - "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh", - "cuda/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh", - "cuda/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh", - "cuda/include/thrust/system/cuda/detail/equal.h", - "cuda/include/thrust/system/cuda/detail/error.inl", - "cuda/include/thrust/system/cuda/detail/execution_policy.h", - "cuda/include/thrust/system/cuda/detail/extrema.h", - "cuda/include/thrust/system/cuda/detail/fill.h", - "cuda/include/thrust/system/cuda/detail/find.h", - "cuda/include/thrust/system/cuda/detail/for_each.h", - "cuda/include/thrust/system/cuda/detail/gather.h", - "cuda/include/thrust/system/cuda/detail/generate.h", - "cuda/include/thrust/system/cuda/detail/get_value.h", - "cuda/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h", - "cuda/include/thrust/system/cuda/detail/guarded_driver_types.h", - "cuda/include/thrust/system/cuda/detail/inner_product.h", - "cuda/include/thrust/system/cuda/detail/internal/copy_cross_system.h", - "cuda/include/thrust/system/cuda/detail/internal/copy_device_to_device.h", - "cuda/include/thrust/system/cuda/detail/iter_swap.h", - 
"cuda/include/thrust/system/cuda/detail/logical.h", - "cuda/include/thrust/system/cuda/detail/malloc_and_free.h", - "cuda/include/thrust/system/cuda/detail/memory.inl", - "cuda/include/thrust/system/cuda/detail/merge.h", - "cuda/include/thrust/system/cuda/detail/mismatch.h", - "cuda/include/thrust/system/cuda/detail/par.h", - "cuda/include/thrust/system/cuda/detail/par_to_seq.h", - "cuda/include/thrust/system/cuda/detail/parallel_for.h", - "cuda/include/thrust/system/cuda/detail/partition.h", - "cuda/include/thrust/system/cuda/detail/reduce.h", - "cuda/include/thrust/system/cuda/detail/reduce_by_key.h", - "cuda/include/thrust/system/cuda/detail/remove.h", - "cuda/include/thrust/system/cuda/detail/replace.h", - "cuda/include/thrust/system/cuda/detail/reverse.h", - "cuda/include/thrust/system/cuda/detail/scan.h", - "cuda/include/thrust/system/cuda/detail/scan_by_key.h", - "cuda/include/thrust/system/cuda/detail/scatter.h", - "cuda/include/thrust/system/cuda/detail/sequence.h", - "cuda/include/thrust/system/cuda/detail/set_operations.h", - "cuda/include/thrust/system/cuda/detail/sort.h", - "cuda/include/thrust/system/cuda/detail/swap_ranges.h", - "cuda/include/thrust/system/cuda/detail/tabulate.h", - "cuda/include/thrust/system/cuda/detail/temporary_buffer.h", - "cuda/include/thrust/system/cuda/detail/terminate.h", - "cuda/include/thrust/system/cuda/detail/transform.h", - "cuda/include/thrust/system/cuda/detail/transform_reduce.h", - "cuda/include/thrust/system/cuda/detail/transform_scan.h", - "cuda/include/thrust/system/cuda/detail/uninitialized_copy.h", - "cuda/include/thrust/system/cuda/detail/uninitialized_fill.h", - "cuda/include/thrust/system/cuda/detail/unique.h", - "cuda/include/thrust/system/cuda/detail/unique_by_key.h", - "cuda/include/thrust/system/cuda/detail/util.h", - "cuda/include/thrust/system/cuda/detail/vector.inl", - "cuda/include/thrust/system/cuda/error.h", - "cuda/include/thrust/system/cuda/execution_policy.h", - "cuda/include/thrust/system/cuda/experimental/pinned_allocator.h", - "cuda/include/thrust/system/cuda/memory.h", - "cuda/include/thrust/system/cuda/vector.h", - "cuda/include/thrust/system/detail/adl/adjacent_difference.h", - "cuda/include/thrust/system/detail/adl/assign_value.h", - "cuda/include/thrust/system/detail/adl/binary_search.h", - "cuda/include/thrust/system/detail/adl/copy.h", - "cuda/include/thrust/system/detail/adl/copy_if.h", - "cuda/include/thrust/system/detail/adl/count.h", - "cuda/include/thrust/system/detail/adl/equal.h", - "cuda/include/thrust/system/detail/adl/extrema.h", - "cuda/include/thrust/system/detail/adl/fill.h", - "cuda/include/thrust/system/detail/adl/find.h", - "cuda/include/thrust/system/detail/adl/for_each.h", - "cuda/include/thrust/system/detail/adl/gather.h", - "cuda/include/thrust/system/detail/adl/generate.h", - "cuda/include/thrust/system/detail/adl/get_value.h", - "cuda/include/thrust/system/detail/adl/inner_product.h", - "cuda/include/thrust/system/detail/adl/iter_swap.h", - "cuda/include/thrust/system/detail/adl/logical.h", - "cuda/include/thrust/system/detail/adl/malloc_and_free.h", - "cuda/include/thrust/system/detail/adl/merge.h", - "cuda/include/thrust/system/detail/adl/mismatch.h", - "cuda/include/thrust/system/detail/adl/partition.h", - "cuda/include/thrust/system/detail/adl/reduce.h", - "cuda/include/thrust/system/detail/adl/reduce_by_key.h", - "cuda/include/thrust/system/detail/adl/remove.h", - "cuda/include/thrust/system/detail/adl/replace.h", - "cuda/include/thrust/system/detail/adl/reverse.h", - 
"cuda/include/thrust/system/detail/adl/scan.h", - "cuda/include/thrust/system/detail/adl/scan_by_key.h", - "cuda/include/thrust/system/detail/adl/scatter.h", - "cuda/include/thrust/system/detail/adl/sequence.h", - "cuda/include/thrust/system/detail/adl/set_operations.h", - "cuda/include/thrust/system/detail/adl/sort.h", - "cuda/include/thrust/system/detail/adl/swap_ranges.h", - "cuda/include/thrust/system/detail/adl/tabulate.h", - "cuda/include/thrust/system/detail/adl/temporary_buffer.h", - "cuda/include/thrust/system/detail/adl/transform.h", - "cuda/include/thrust/system/detail/adl/transform_reduce.h", - "cuda/include/thrust/system/detail/adl/transform_scan.h", - "cuda/include/thrust/system/detail/adl/uninitialized_copy.h", - "cuda/include/thrust/system/detail/adl/uninitialized_fill.h", - "cuda/include/thrust/system/detail/adl/unique.h", - "cuda/include/thrust/system/detail/adl/unique_by_key.h", - "cuda/include/thrust/system/detail/bad_alloc.h", - "cuda/include/thrust/system/detail/errno.h", - "cuda/include/thrust/system/detail/error_category.inl", - "cuda/include/thrust/system/detail/error_code.inl", - "cuda/include/thrust/system/detail/error_condition.inl", - "cuda/include/thrust/system/detail/generic/adjacent_difference.h", - "cuda/include/thrust/system/detail/generic/adjacent_difference.inl", - "cuda/include/thrust/system/detail/generic/advance.h", - "cuda/include/thrust/system/detail/generic/advance.inl", - "cuda/include/thrust/system/detail/generic/binary_search.h", - "cuda/include/thrust/system/detail/generic/binary_search.inl", - "cuda/include/thrust/system/detail/generic/copy.h", - "cuda/include/thrust/system/detail/generic/copy.inl", - "cuda/include/thrust/system/detail/generic/copy_if.h", - "cuda/include/thrust/system/detail/generic/copy_if.inl", - "cuda/include/thrust/system/detail/generic/count.h", - "cuda/include/thrust/system/detail/generic/count.inl", - "cuda/include/thrust/system/detail/generic/distance.h", - "cuda/include/thrust/system/detail/generic/distance.inl", - "cuda/include/thrust/system/detail/generic/equal.h", - "cuda/include/thrust/system/detail/generic/equal.inl", - "cuda/include/thrust/system/detail/generic/extrema.h", - "cuda/include/thrust/system/detail/generic/extrema.inl", - "cuda/include/thrust/system/detail/generic/fill.h", - "cuda/include/thrust/system/detail/generic/find.h", - "cuda/include/thrust/system/detail/generic/find.inl", - "cuda/include/thrust/system/detail/generic/for_each.h", - "cuda/include/thrust/system/detail/generic/gather.h", - "cuda/include/thrust/system/detail/generic/gather.inl", - "cuda/include/thrust/system/detail/generic/generate.h", - "cuda/include/thrust/system/detail/generic/generate.inl", - "cuda/include/thrust/system/detail/generic/inner_product.h", - "cuda/include/thrust/system/detail/generic/inner_product.inl", - "cuda/include/thrust/system/detail/generic/logical.h", - "cuda/include/thrust/system/detail/generic/memory.h", - "cuda/include/thrust/system/detail/generic/memory.inl", - "cuda/include/thrust/system/detail/generic/merge.h", - "cuda/include/thrust/system/detail/generic/merge.inl", - "cuda/include/thrust/system/detail/generic/mismatch.h", - "cuda/include/thrust/system/detail/generic/mismatch.inl", - "cuda/include/thrust/system/detail/generic/partition.h", - "cuda/include/thrust/system/detail/generic/partition.inl", - "cuda/include/thrust/system/detail/generic/reduce.h", - "cuda/include/thrust/system/detail/generic/reduce.inl", - "cuda/include/thrust/system/detail/generic/reduce_by_key.h", - 
"cuda/include/thrust/system/detail/generic/reduce_by_key.inl", - "cuda/include/thrust/system/detail/generic/remove.h", - "cuda/include/thrust/system/detail/generic/remove.inl", - "cuda/include/thrust/system/detail/generic/replace.h", - "cuda/include/thrust/system/detail/generic/replace.inl", - "cuda/include/thrust/system/detail/generic/reverse.h", - "cuda/include/thrust/system/detail/generic/reverse.inl", - "cuda/include/thrust/system/detail/generic/scalar/binary_search.h", - "cuda/include/thrust/system/detail/generic/scalar/binary_search.inl", - "cuda/include/thrust/system/detail/generic/scan.h", - "cuda/include/thrust/system/detail/generic/scan.inl", - "cuda/include/thrust/system/detail/generic/scan_by_key.h", - "cuda/include/thrust/system/detail/generic/scan_by_key.inl", - "cuda/include/thrust/system/detail/generic/scatter.h", - "cuda/include/thrust/system/detail/generic/scatter.inl", - "cuda/include/thrust/system/detail/generic/select_system.h", - "cuda/include/thrust/system/detail/generic/sequence.h", - "cuda/include/thrust/system/detail/generic/sequence.inl", - "cuda/include/thrust/system/detail/generic/set_operations.h", - "cuda/include/thrust/system/detail/generic/set_operations.inl", - "cuda/include/thrust/system/detail/generic/sort.h", - "cuda/include/thrust/system/detail/generic/sort.inl", - "cuda/include/thrust/system/detail/generic/swap_ranges.h", - "cuda/include/thrust/system/detail/generic/swap_ranges.inl", - "cuda/include/thrust/system/detail/generic/tabulate.h", - "cuda/include/thrust/system/detail/generic/tabulate.inl", - "cuda/include/thrust/system/detail/generic/tag.h", - "cuda/include/thrust/system/detail/generic/temporary_buffer.h", - "cuda/include/thrust/system/detail/generic/temporary_buffer.inl", - "cuda/include/thrust/system/detail/generic/transform.h", - "cuda/include/thrust/system/detail/generic/transform.inl", - "cuda/include/thrust/system/detail/generic/transform_reduce.h", - "cuda/include/thrust/system/detail/generic/transform_reduce.inl", - "cuda/include/thrust/system/detail/generic/transform_scan.h", - "cuda/include/thrust/system/detail/generic/transform_scan.inl", - "cuda/include/thrust/system/detail/generic/type_traits.h", - "cuda/include/thrust/system/detail/generic/uninitialized_copy.h", - "cuda/include/thrust/system/detail/generic/uninitialized_copy.inl", - "cuda/include/thrust/system/detail/generic/uninitialized_fill.h", - "cuda/include/thrust/system/detail/generic/uninitialized_fill.inl", - "cuda/include/thrust/system/detail/generic/unique.h", - "cuda/include/thrust/system/detail/generic/unique.inl", - "cuda/include/thrust/system/detail/generic/unique_by_key.h", - "cuda/include/thrust/system/detail/generic/unique_by_key.inl", - "cuda/include/thrust/system/detail/internal/decompose.h", - "cuda/include/thrust/system/detail/sequential/adjacent_difference.h", - "cuda/include/thrust/system/detail/sequential/assign_value.h", - "cuda/include/thrust/system/detail/sequential/binary_search.h", - "cuda/include/thrust/system/detail/sequential/copy.h", - "cuda/include/thrust/system/detail/sequential/copy.inl", - "cuda/include/thrust/system/detail/sequential/copy_backward.h", - "cuda/include/thrust/system/detail/sequential/copy_if.h", - "cuda/include/thrust/system/detail/sequential/count.h", - "cuda/include/thrust/system/detail/sequential/equal.h", - "cuda/include/thrust/system/detail/sequential/execution_policy.h", - "cuda/include/thrust/system/detail/sequential/extrema.h", - "cuda/include/thrust/system/detail/sequential/fill.h", - 
"cuda/include/thrust/system/detail/sequential/find.h", - "cuda/include/thrust/system/detail/sequential/for_each.h", - "cuda/include/thrust/system/detail/sequential/gather.h", - "cuda/include/thrust/system/detail/sequential/general_copy.h", - "cuda/include/thrust/system/detail/sequential/generate.h", - "cuda/include/thrust/system/detail/sequential/get_value.h", - "cuda/include/thrust/system/detail/sequential/inner_product.h", - "cuda/include/thrust/system/detail/sequential/insertion_sort.h", - "cuda/include/thrust/system/detail/sequential/iter_swap.h", - "cuda/include/thrust/system/detail/sequential/logical.h", - "cuda/include/thrust/system/detail/sequential/malloc_and_free.h", - "cuda/include/thrust/system/detail/sequential/merge.h", - "cuda/include/thrust/system/detail/sequential/merge.inl", - "cuda/include/thrust/system/detail/sequential/mismatch.h", - "cuda/include/thrust/system/detail/sequential/partition.h", - "cuda/include/thrust/system/detail/sequential/reduce.h", - "cuda/include/thrust/system/detail/sequential/reduce_by_key.h", - "cuda/include/thrust/system/detail/sequential/remove.h", - "cuda/include/thrust/system/detail/sequential/replace.h", - "cuda/include/thrust/system/detail/sequential/reverse.h", - "cuda/include/thrust/system/detail/sequential/scan.h", - "cuda/include/thrust/system/detail/sequential/scan_by_key.h", - "cuda/include/thrust/system/detail/sequential/scatter.h", - "cuda/include/thrust/system/detail/sequential/sequence.h", - "cuda/include/thrust/system/detail/sequential/set_operations.h", - "cuda/include/thrust/system/detail/sequential/sort.h", - "cuda/include/thrust/system/detail/sequential/sort.inl", - "cuda/include/thrust/system/detail/sequential/stable_merge_sort.h", - "cuda/include/thrust/system/detail/sequential/stable_merge_sort.inl", - "cuda/include/thrust/system/detail/sequential/stable_primitive_sort.h", - "cuda/include/thrust/system/detail/sequential/stable_primitive_sort.inl", - "cuda/include/thrust/system/detail/sequential/stable_radix_sort.h", - "cuda/include/thrust/system/detail/sequential/stable_radix_sort.inl", - "cuda/include/thrust/system/detail/sequential/swap_ranges.h", - "cuda/include/thrust/system/detail/sequential/tabulate.h", - "cuda/include/thrust/system/detail/sequential/temporary_buffer.h", - "cuda/include/thrust/system/detail/sequential/transform.h", - "cuda/include/thrust/system/detail/sequential/transform_reduce.h", - "cuda/include/thrust/system/detail/sequential/transform_scan.h", - "cuda/include/thrust/system/detail/sequential/trivial_copy.h", - "cuda/include/thrust/system/detail/sequential/uninitialized_copy.h", - "cuda/include/thrust/system/detail/sequential/uninitialized_fill.h", - "cuda/include/thrust/system/detail/sequential/unique.h", - "cuda/include/thrust/system/detail/sequential/unique_by_key.h", - "cuda/include/thrust/system/detail/system_error.inl", - "cuda/include/thrust/system/error_code.h", - "cuda/include/thrust/system/omp/detail/adjacent_difference.h", - "cuda/include/thrust/system/omp/detail/assign_value.h", - "cuda/include/thrust/system/omp/detail/binary_search.h", - "cuda/include/thrust/system/omp/detail/copy.h", - "cuda/include/thrust/system/omp/detail/copy.inl", - "cuda/include/thrust/system/omp/detail/copy_if.h", - "cuda/include/thrust/system/omp/detail/copy_if.inl", - "cuda/include/thrust/system/omp/detail/count.h", - "cuda/include/thrust/system/omp/detail/default_decomposition.h", - "cuda/include/thrust/system/omp/detail/default_decomposition.inl", - "cuda/include/thrust/system/omp/detail/equal.h", - 
"cuda/include/thrust/system/omp/detail/execution_policy.h", - "cuda/include/thrust/system/omp/detail/extrema.h", - "cuda/include/thrust/system/omp/detail/fill.h", - "cuda/include/thrust/system/omp/detail/find.h", - "cuda/include/thrust/system/omp/detail/for_each.h", - "cuda/include/thrust/system/omp/detail/for_each.inl", - "cuda/include/thrust/system/omp/detail/gather.h", - "cuda/include/thrust/system/omp/detail/generate.h", - "cuda/include/thrust/system/omp/detail/get_value.h", - "cuda/include/thrust/system/omp/detail/inner_product.h", - "cuda/include/thrust/system/omp/detail/iter_swap.h", - "cuda/include/thrust/system/omp/detail/logical.h", - "cuda/include/thrust/system/omp/detail/malloc_and_free.h", - "cuda/include/thrust/system/omp/detail/memory.inl", - "cuda/include/thrust/system/omp/detail/merge.h", - "cuda/include/thrust/system/omp/detail/mismatch.h", - "cuda/include/thrust/system/omp/detail/par.h", - "cuda/include/thrust/system/omp/detail/partition.h", - "cuda/include/thrust/system/omp/detail/partition.inl", - "cuda/include/thrust/system/omp/detail/reduce.h", - "cuda/include/thrust/system/omp/detail/reduce.inl", - "cuda/include/thrust/system/omp/detail/reduce_by_key.h", - "cuda/include/thrust/system/omp/detail/reduce_by_key.inl", - "cuda/include/thrust/system/omp/detail/reduce_intervals.h", - "cuda/include/thrust/system/omp/detail/reduce_intervals.inl", - "cuda/include/thrust/system/omp/detail/remove.h", - "cuda/include/thrust/system/omp/detail/remove.inl", - "cuda/include/thrust/system/omp/detail/replace.h", - "cuda/include/thrust/system/omp/detail/reverse.h", - "cuda/include/thrust/system/omp/detail/scan.h", - "cuda/include/thrust/system/omp/detail/scan_by_key.h", - "cuda/include/thrust/system/omp/detail/scatter.h", - "cuda/include/thrust/system/omp/detail/sequence.h", - "cuda/include/thrust/system/omp/detail/set_operations.h", - "cuda/include/thrust/system/omp/detail/sort.h", - "cuda/include/thrust/system/omp/detail/sort.inl", - "cuda/include/thrust/system/omp/detail/swap_ranges.h", - "cuda/include/thrust/system/omp/detail/tabulate.h", - "cuda/include/thrust/system/omp/detail/temporary_buffer.h", - "cuda/include/thrust/system/omp/detail/transform.h", - "cuda/include/thrust/system/omp/detail/transform_reduce.h", - "cuda/include/thrust/system/omp/detail/transform_scan.h", - "cuda/include/thrust/system/omp/detail/uninitialized_copy.h", - "cuda/include/thrust/system/omp/detail/uninitialized_fill.h", - "cuda/include/thrust/system/omp/detail/unique.h", - "cuda/include/thrust/system/omp/detail/unique.inl", - "cuda/include/thrust/system/omp/detail/unique_by_key.h", - "cuda/include/thrust/system/omp/detail/unique_by_key.inl", - "cuda/include/thrust/system/omp/detail/vector.inl", - "cuda/include/thrust/system/omp/execution_policy.h", - "cuda/include/thrust/system/omp/memory.h", - "cuda/include/thrust/system/omp/vector.h", - "cuda/include/thrust/system/system_error.h", - "cuda/include/thrust/system/tbb/detail/adjacent_difference.h", - "cuda/include/thrust/system/tbb/detail/assign_value.h", - "cuda/include/thrust/system/tbb/detail/binary_search.h", - "cuda/include/thrust/system/tbb/detail/copy.h", - "cuda/include/thrust/system/tbb/detail/copy.inl", - "cuda/include/thrust/system/tbb/detail/copy_if.h", - "cuda/include/thrust/system/tbb/detail/copy_if.inl", - "cuda/include/thrust/system/tbb/detail/count.h", - "cuda/include/thrust/system/tbb/detail/equal.h", - "cuda/include/thrust/system/tbb/detail/execution_policy.h", - "cuda/include/thrust/system/tbb/detail/extrema.h", - 
"cuda/include/thrust/system/tbb/detail/fill.h", - "cuda/include/thrust/system/tbb/detail/find.h", - "cuda/include/thrust/system/tbb/detail/for_each.h", - "cuda/include/thrust/system/tbb/detail/for_each.inl", - "cuda/include/thrust/system/tbb/detail/gather.h", - "cuda/include/thrust/system/tbb/detail/generate.h", - "cuda/include/thrust/system/tbb/detail/get_value.h", - "cuda/include/thrust/system/tbb/detail/inner_product.h", - "cuda/include/thrust/system/tbb/detail/iter_swap.h", - "cuda/include/thrust/system/tbb/detail/logical.h", - "cuda/include/thrust/system/tbb/detail/malloc_and_free.h", - "cuda/include/thrust/system/tbb/detail/memory.inl", - "cuda/include/thrust/system/tbb/detail/merge.h", - "cuda/include/thrust/system/tbb/detail/merge.inl", - "cuda/include/thrust/system/tbb/detail/mismatch.h", - "cuda/include/thrust/system/tbb/detail/par.h", - "cuda/include/thrust/system/tbb/detail/partition.h", - "cuda/include/thrust/system/tbb/detail/partition.inl", - "cuda/include/thrust/system/tbb/detail/reduce.h", - "cuda/include/thrust/system/tbb/detail/reduce.inl", - "cuda/include/thrust/system/tbb/detail/reduce_by_key.h", - "cuda/include/thrust/system/tbb/detail/reduce_by_key.inl", - "cuda/include/thrust/system/tbb/detail/reduce_intervals.h", - "cuda/include/thrust/system/tbb/detail/remove.h", - "cuda/include/thrust/system/tbb/detail/remove.inl", - "cuda/include/thrust/system/tbb/detail/replace.h", - "cuda/include/thrust/system/tbb/detail/reverse.h", - "cuda/include/thrust/system/tbb/detail/scan.h", - "cuda/include/thrust/system/tbb/detail/scan.inl", - "cuda/include/thrust/system/tbb/detail/scan_by_key.h", - "cuda/include/thrust/system/tbb/detail/scatter.h", - "cuda/include/thrust/system/tbb/detail/sequence.h", - "cuda/include/thrust/system/tbb/detail/set_operations.h", - "cuda/include/thrust/system/tbb/detail/sort.h", - "cuda/include/thrust/system/tbb/detail/sort.inl", - "cuda/include/thrust/system/tbb/detail/swap_ranges.h", - "cuda/include/thrust/system/tbb/detail/tabulate.h", - "cuda/include/thrust/system/tbb/detail/temporary_buffer.h", - "cuda/include/thrust/system/tbb/detail/transform.h", - "cuda/include/thrust/system/tbb/detail/transform_reduce.h", - "cuda/include/thrust/system/tbb/detail/transform_scan.h", - "cuda/include/thrust/system/tbb/detail/uninitialized_copy.h", - "cuda/include/thrust/system/tbb/detail/uninitialized_fill.h", - "cuda/include/thrust/system/tbb/detail/unique.h", - "cuda/include/thrust/system/tbb/detail/unique.inl", - "cuda/include/thrust/system/tbb/detail/unique_by_key.h", - "cuda/include/thrust/system/tbb/detail/unique_by_key.inl", - "cuda/include/thrust/system/tbb/detail/vector.inl", - "cuda/include/thrust/system/tbb/execution_policy.h", - "cuda/include/thrust/system/tbb/memory.h", - "cuda/include/thrust/system/tbb/vector.h", - "cuda/include/thrust/system_error.h", - "cuda/include/thrust/tabulate.h", - "cuda/include/thrust/transform.h", - "cuda/include/thrust/transform_reduce.h", - "cuda/include/thrust/transform_scan.h", - "cuda/include/thrust/tuple.h", - "cuda/include/thrust/uninitialized_copy.h", - "cuda/include/thrust/uninitialized_fill.h", - "cuda/include/thrust/unique.h", - "cuda/include/thrust/version.h", - "cuda/include/vector_functions.h", - "cuda/include/vector_functions.hpp", - "cuda/include/vector_types.h", - ], - cmd = """cp -rLf "/usr/local/cuda-10.0/include/." "$(@D)/cuda/include/" """, -) - -genrule( - name = "cuda-nvvm", - outs = [ - "cuda/nvvm/libdevice/libdevice.10.bc", - ], - cmd = """cp -rLf "/usr/local/cuda-10.0/nvvm/libdevice/." 
"$(@D)/" """, -) - -genrule( - name = "cuda-extras", - outs = [ - "cuda/extras/CUPTI/include/GL/gl.h", - "cuda/extras/CUPTI/include/GL/glew.h", - "cuda/extras/CUPTI/include/GL/glext.h", - "cuda/extras/CUPTI/include/GL/glu.h", - "cuda/extras/CUPTI/include/GL/glut.h", - "cuda/extras/CUPTI/include/GL/glx.h", - "cuda/extras/CUPTI/include/GL/glxext.h", - "cuda/extras/CUPTI/include/GL/wglew.h", - "cuda/extras/CUPTI/include/GL/wglext.h", - "cuda/extras/CUPTI/include/cuda_stdint.h", - "cuda/extras/CUPTI/include/cupti.h", - "cuda/extras/CUPTI/include/cupti_activity.h", - "cuda/extras/CUPTI/include/cupti_callbacks.h", - "cuda/extras/CUPTI/include/cupti_driver_cbid.h", - "cuda/extras/CUPTI/include/cupti_events.h", - "cuda/extras/CUPTI/include/cupti_metrics.h", - "cuda/extras/CUPTI/include/cupti_nvtx_cbid.h", - "cuda/extras/CUPTI/include/cupti_result.h", - "cuda/extras/CUPTI/include/cupti_runtime_cbid.h", - "cuda/extras/CUPTI/include/cupti_version.h", - "cuda/extras/CUPTI/include/generated_cudaGL_meta.h", - "cuda/extras/CUPTI/include/generated_cudaVDPAU_meta.h", - "cuda/extras/CUPTI/include/generated_cuda_gl_interop_meta.h", - "cuda/extras/CUPTI/include/generated_cuda_meta.h", - "cuda/extras/CUPTI/include/generated_cuda_runtime_api_meta.h", - "cuda/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h", - "cuda/extras/CUPTI/include/generated_nvtx_meta.h", - "cuda/extras/CUPTI/include/openacc/cupti_openacc.h", - "cuda/extras/CUPTI/include/openmp/cupti_openmp.h", - "cuda/extras/CUPTI/include/openmp/ompt.h", - ], - cmd = """cp -rLf "/usr/local/cuda-10.0/extras/CUPTI/include/." "$(@D)/cuda/extras/CUPTI/include/" """, -) - -genrule( - name = "cublas-include", - outs = [ - "cublas/include/cublas.h", - "cublas/include/cublas_v2.h", - "cublas/include/cublas_api.h", - ], - cmd = """cp -f "/usr/local/cuda-10.0/include/cublas.h" "$(location cublas/include/cublas.h)" && \ -cp -f "/usr/local/cuda-10.0/include/cublas_v2.h" "$(location cublas/include/cublas_v2.h)" && \ -cp -f "/usr/local/cuda-10.0/include/cublas_api.h" "$(location cublas/include/cublas_api.h)" """, -) - -genrule( - name = "cuda-lib", - outs = [ - "cuda/lib/libcuda.so", - "cuda/lib/libcudart.so.10.0", - "cuda/lib/libcudart_static.a", - "cuda/lib/libcublas.so.10.0", - "cuda/lib/libcusolver.so.10.0", - "cuda/lib/libcurand.so.10.0", - "cuda/lib/libcufft.so.10.0", - "cuda/lib/libcudnn.so.7", - "cuda/lib/libcupti.so.10.0", - "cuda/lib/libcusparse.so.10.0", - ], - cmd = """cp -f "/usr/local/cuda-10.0/lib64/stubs/libcuda.so" "$(location cuda/lib/libcuda.so)" && \ -cp -f "/usr/local/cuda-10.0/lib64/libcudart.so.10.0" "$(location cuda/lib/libcudart.so.10.0)" && \ -cp -f "/usr/local/cuda-10.0/lib64/libcudart_static.a" "$(location cuda/lib/libcudart_static.a)" && \ -cp -f "/usr/local/cuda-10.0/lib64/libcublas.so.10.0" "$(location cuda/lib/libcublas.so.10.0)" && \ -cp -f "/usr/local/cuda-10.0/lib64/libcusolver.so.10.0" "$(location cuda/lib/libcusolver.so.10.0)" && \ -cp -f "/usr/local/cuda-10.0/lib64/libcurand.so.10.0" "$(location cuda/lib/libcurand.so.10.0)" && \ -cp -f "/usr/local/cuda-10.0/lib64/libcufft.so.10.0" "$(location cuda/lib/libcufft.so.10.0)" && \ -cp -f "/usr/lib/x86_64-linux-gnu/libcudnn.so.7" "$(location cuda/lib/libcudnn.so.7)" && \ -cp -f "/usr/local/cuda-10.0/extras/CUPTI/lib64/libcupti.so.10.0" "$(location cuda/lib/libcupti.so.10.0)" && \ -cp -f "/usr/local/cuda-10.0/lib64/libcusparse.so.10.0" "$(location cuda/lib/libcusparse.so.10.0)" """, -) - -genrule( - name = "cuda-bin", - outs = [ - "cuda/bin/crt/link.stub", - "cuda/bin/nvlink", - 
"cuda/bin/fatbinary", - "cuda/bin/bin2c", - ], - cmd = """cp -f "/usr/local/cuda-10.0/bin/crt/link.stub" "$(location cuda/bin/crt/link.stub)" && \ -cp -f "/usr/local/cuda-10.0/bin/nvlink" "$(location cuda/bin/nvlink)" && \ -cp -f "/usr/local/cuda-10.0/bin/fatbinary" "$(location cuda/bin/fatbinary)" && \ -cp -f "/usr/local/cuda-10.0/bin/bin2c" "$(location cuda/bin/bin2c)" """, -) - -genrule( - name = "cudnn-include", - outs = [ - "cudnn/include/cudnn.h", - ], - cmd = """cp -f "/usr/include/cudnn.h" "$(location cudnn/include/cudnn.h)" """, -) diff --git a/third_party/toolchains/preconfig/ubuntu16.04/cuda10.0-cudnn7/cuda/build_defs.bzl b/third_party/toolchains/preconfig/ubuntu16.04/cuda10.0-cudnn7/cuda/build_defs.bzl deleted file mode 100755 index 254904c105e..00000000000 --- a/third_party/toolchains/preconfig/ubuntu16.04/cuda10.0-cudnn7/cuda/build_defs.bzl +++ /dev/null @@ -1,76 +0,0 @@ -# Macros for building CUDA code. -def if_cuda(if_true, if_false = []): - """Shorthand for select()'ing on whether we're building with CUDA. - - Returns a select statement which evaluates to if_true if we're building - with CUDA enabled. Otherwise, the select statement evaluates to if_false. - - """ - return select({ - "@local_config_cuda//cuda:using_nvcc": if_true, - "@local_config_cuda//cuda:using_clang": if_true, - "//conditions:default": if_false, - }) - -def if_cuda_clang(if_true, if_false = []): - """Shorthand for select()'ing on wheteher we're building with cuda-clang. - - Returns a select statement which evaluates to if_true if we're building - with cuda-clang. Otherwise, the select statement evaluates to if_false. - - """ - return select({ - "@local_config_cuda//cuda:using_clang": if_true, - "//conditions:default": if_false, - }) - -def cuda_default_copts(): - """Default options for all CUDA compilations.""" - return if_cuda(["-x", "cuda", "-DGOOGLE_CUDA=1"]) + if_cuda_clang(["--cuda-gpu-arch=sm_30", "--cuda-gpu-arch=sm_60"]) - -def cuda_is_configured(): - """Returns true if CUDA was enabled during the configure process.""" - return True - -def if_cuda_is_configured(x): - """Tests if the CUDA was enabled during the configure process. - - Unlike if_cuda(), this does not require that we are building with - --config=cuda. Used to allow non-CUDA code to depend on CUDA libraries. - """ - if cuda_is_configured(): - return select({"//conditions:default": x}) - return select({"//conditions:default": []}) - -def cuda_header_library( - name, - hdrs, - include_prefix = None, - strip_include_prefix = None, - deps = [], - **kwargs): - """Generates a cc_library containing both virtual and system include paths. - - Generates both a header-only target with virtual includes plus the full - target without virtual includes. 
This works around the fact that bazel can't - mix 'includes' and 'include_prefix' in the same target.""" - - native.cc_library( - name = name + "_virtual", - hdrs = hdrs, - include_prefix = include_prefix, - strip_include_prefix = strip_include_prefix, - deps = deps, - visibility = ["//visibility:private"], - ) - - native.cc_library( - name = name, - textual_hdrs = hdrs, - deps = deps + [":%s_virtual" % name], - **kwargs - ) - -def cuda_library(copts = [], **kwargs): - """Wrapper over cc_library which adds default CUDA options.""" - native.cc_library(copts = cuda_default_copts() + copts, **kwargs) diff --git a/third_party/toolchains/preconfig/ubuntu16.04/cuda10.0-cudnn7/cuda/cuda/cuda_config.h b/third_party/toolchains/preconfig/ubuntu16.04/cuda10.0-cudnn7/cuda/cuda/cuda_config.h deleted file mode 100755 index 72a7cf77346..00000000000 --- a/third_party/toolchains/preconfig/ubuntu16.04/cuda10.0-cudnn7/cuda/cuda/cuda_config.h +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef CUDA_CUDA_CONFIG_H_ -#define CUDA_CUDA_CONFIG_H_ - -#define TF_CUDA_CAPABILITIES CudaVersion("3.0"), CudaVersion("6.0") - -#define TF_CUDA_VERSION "10.0" -#define TF_CUDA_LIB_VERSION "10.0" -#define TF_CUDNN_VERSION "7" - -#define TF_CUDA_TOOLKIT_PATH "/usr/local/cuda-10.0" - -#endif // CUDA_CUDA_CONFIG_H_ diff --git a/third_party/toolchains/preconfig/ubuntu16.04/py3_opt/BUILD b/third_party/toolchains/preconfig/ubuntu16.04/py3_opt/BUILD deleted file mode 100755 index 2244d81abd0..00000000000 --- a/third_party/toolchains/preconfig/ubuntu16.04/py3_opt/BUILD +++ /dev/null @@ -1,209 +0,0 @@ -licenses(["restricted"]) - -package(default_visibility = ["//visibility:public"]) - -# Point both runtimes to the same python binary to ensure we always -# use the python binary specified by ./configure.py script. -load("@bazel_tools//tools/python:toolchain.bzl", "py_runtime_pair") - -py_runtime( - name = "py2_runtime", - interpreter_path = "/usr/bin/python3", - python_version = "PY2", -) - -py_runtime( - name = "py3_runtime", - interpreter_path = "/usr/bin/python3", - python_version = "PY3", -) - -py_runtime_pair( - name = "py_runtime_pair", - py2_runtime = ":py2_runtime", - py3_runtime = ":py3_runtime", -) - -toolchain( - name = "py_toolchain", - toolchain = ":py_runtime_pair", - toolchain_type = "@bazel_tools//tools/python:toolchain_type", -) - -# To build Python C/C++ extension on Windows, we need to link to python import library pythonXY.lib -# See https://docs.python.org/3/extending/windows.html -cc_import( - name = "python_lib", - interface_library = select({ - ":windows": ":python_import_lib", - # A placeholder for Unix platforms which makes --no_build happy. 
- "//conditions:default": "not-existing.lib", - }), - system_provided = 1, -) - -cc_library( - name = "python_headers", - hdrs = [":python_include"], - includes = ["python_include"], - deps = select({ - ":windows": [":python_lib"], - "//conditions:default": [], - }), -) - -cc_library( - name = "numpy_headers", - hdrs = [":numpy_include"], - includes = ["numpy_include"], -) - -config_setting( - name = "windows", - values = {"cpu": "x64_windows"}, - visibility = ["//visibility:public"], -) - -genrule( - name = "python_include", - outs = [ - "python_include/Python-ast.h", - "python_include/Python.h", - "python_include/abstract.h", - "python_include/accu.h", - "python_include/asdl.h", - "python_include/ast.h", - "python_include/bitset.h", - "python_include/bltinmodule.h", - "python_include/boolobject.h", - "python_include/bytearrayobject.h", - "python_include/bytes_methods.h", - "python_include/bytesobject.h", - "python_include/cellobject.h", - "python_include/ceval.h", - "python_include/classobject.h", - "python_include/code.h", - "python_include/codecs.h", - "python_include/compile.h", - "python_include/complexobject.h", - "python_include/datetime.h", - "python_include/descrobject.h", - "python_include/dictobject.h", - "python_include/dtoa.h", - "python_include/dynamic_annotations.h", - "python_include/enumobject.h", - "python_include/errcode.h", - "python_include/eval.h", - "python_include/fileobject.h", - "python_include/fileutils.h", - "python_include/floatobject.h", - "python_include/frameobject.h", - "python_include/funcobject.h", - "python_include/genobject.h", - "python_include/graminit.h", - "python_include/grammar.h", - "python_include/import.h", - "python_include/intrcheck.h", - "python_include/iterobject.h", - "python_include/listobject.h", - "python_include/longintrepr.h", - "python_include/longobject.h", - "python_include/marshal.h", - "python_include/memoryobject.h", - "python_include/metagrammar.h", - "python_include/methodobject.h", - "python_include/modsupport.h", - "python_include/moduleobject.h", - "python_include/namespaceobject.h", - "python_include/node.h", - "python_include/object.h", - "python_include/objimpl.h", - "python_include/odictobject.h", - "python_include/opcode.h", - "python_include/osdefs.h", - "python_include/osmodule.h", - "python_include/parsetok.h", - "python_include/patchlevel.h", - "python_include/pgen.h", - "python_include/pgenheaders.h", - "python_include/py_curses.h", - "python_include/pyarena.h", - "python_include/pyatomic.h", - "python_include/pycapsule.h", - "python_include/pyconfig.h", - "python_include/pyctype.h", - "python_include/pydebug.h", - "python_include/pydtrace.h", - "python_include/pyerrors.h", - "python_include/pyexpat.h", - "python_include/pyfpe.h", - "python_include/pygetopt.h", - "python_include/pyhash.h", - "python_include/pylifecycle.h", - "python_include/pymacconfig.h", - "python_include/pymacro.h", - "python_include/pymath.h", - "python_include/pymem.h", - "python_include/pyport.h", - "python_include/pystate.h", - "python_include/pystrcmp.h", - "python_include/pystrhex.h", - "python_include/pystrtod.h", - "python_include/pythonrun.h", - "python_include/pythread.h", - "python_include/pytime.h", - "python_include/rangeobject.h", - "python_include/setobject.h", - "python_include/sliceobject.h", - "python_include/structmember.h", - "python_include/structseq.h", - "python_include/symtable.h", - "python_include/sysmodule.h", - "python_include/token.h", - "python_include/traceback.h", - "python_include/tupleobject.h", - 
"python_include/typeslots.h", - "python_include/ucnhash.h", - "python_include/unicodeobject.h", - "python_include/warnings.h", - "python_include/weakrefobject.h", - ], - cmd = """ -cp -f "/opt/python3.6/include/python3.6m/Python-ast.h" "$(@D)/python_include/Python-ast.h" && cp -f "/opt/python3.6/include/python3.6m/Python.h" "$(@D)/python_include/Python.h" && cp -f "/opt/python3.6/include/python3.6m/abstract.h" "$(@D)/python_include/abstract.h" && cp -f "/opt/python3.6/include/python3.6m/accu.h" "$(@D)/python_include/accu.h" && cp -f "/opt/python3.6/include/python3.6m/asdl.h" "$(@D)/python_include/asdl.h" && cp -f "/opt/python3.6/include/python3.6m/ast.h" "$(@D)/python_include/ast.h" && cp -f "/opt/python3.6/include/python3.6m/bitset.h" "$(@D)/python_include/bitset.h" && cp -f "/opt/python3.6/include/python3.6m/bltinmodule.h" "$(@D)/python_include/bltinmodule.h" && cp -f "/opt/python3.6/include/python3.6m/boolobject.h" "$(@D)/python_include/boolobject.h" && cp -f "/opt/python3.6/include/python3.6m/bytearrayobject.h" "$(@D)/python_include/bytearrayobject.h" && cp -f "/opt/python3.6/include/python3.6m/bytes_methods.h" "$(@D)/python_include/bytes_methods.h" && cp -f "/opt/python3.6/include/python3.6m/bytesobject.h" "$(@D)/python_include/bytesobject.h" && cp -f "/opt/python3.6/include/python3.6m/cellobject.h" "$(@D)/python_include/cellobject.h" && cp -f "/opt/python3.6/include/python3.6m/ceval.h" "$(@D)/python_include/ceval.h" && cp -f "/opt/python3.6/include/python3.6m/classobject.h" "$(@D)/python_include/classobject.h" && cp -f "/opt/python3.6/include/python3.6m/code.h" "$(@D)/python_include/code.h" && cp -f "/opt/python3.6/include/python3.6m/codecs.h" "$(@D)/python_include/codecs.h" && cp -f "/opt/python3.6/include/python3.6m/compile.h" "$(@D)/python_include/compile.h" && cp -f "/opt/python3.6/include/python3.6m/complexobject.h" "$(@D)/python_include/complexobject.h" && cp -f "/opt/python3.6/include/python3.6m/datetime.h" "$(@D)/python_include/datetime.h" && cp -f "/opt/python3.6/include/python3.6m/descrobject.h" "$(@D)/python_include/descrobject.h" && cp -f "/opt/python3.6/include/python3.6m/dictobject.h" "$(@D)/python_include/dictobject.h" && cp -f "/opt/python3.6/include/python3.6m/dtoa.h" "$(@D)/python_include/dtoa.h" && cp -f "/opt/python3.6/include/python3.6m/dynamic_annotations.h" "$(@D)/python_include/dynamic_annotations.h" && cp -f "/opt/python3.6/include/python3.6m/enumobject.h" "$(@D)/python_include/enumobject.h" && cp -f "/opt/python3.6/include/python3.6m/errcode.h" "$(@D)/python_include/errcode.h" && cp -f "/opt/python3.6/include/python3.6m/eval.h" "$(@D)/python_include/eval.h" && cp -f "/opt/python3.6/include/python3.6m/fileobject.h" "$(@D)/python_include/fileobject.h" && cp -f "/opt/python3.6/include/python3.6m/fileutils.h" "$(@D)/python_include/fileutils.h" && cp -f "/opt/python3.6/include/python3.6m/floatobject.h" "$(@D)/python_include/floatobject.h" && cp -f "/opt/python3.6/include/python3.6m/frameobject.h" "$(@D)/python_include/frameobject.h" && cp -f "/opt/python3.6/include/python3.6m/funcobject.h" "$(@D)/python_include/funcobject.h" && cp -f "/opt/python3.6/include/python3.6m/genobject.h" "$(@D)/python_include/genobject.h" && cp -f "/opt/python3.6/include/python3.6m/graminit.h" "$(@D)/python_include/graminit.h" && cp -f "/opt/python3.6/include/python3.6m/grammar.h" "$(@D)/python_include/grammar.h" && cp -f "/opt/python3.6/include/python3.6m/import.h" "$(@D)/python_include/import.h" && cp -f "/opt/python3.6/include/python3.6m/intrcheck.h" 
"$(@D)/python_include/intrcheck.h" && cp -f "/opt/python3.6/include/python3.6m/iterobject.h" "$(@D)/python_include/iterobject.h" && cp -f "/opt/python3.6/include/python3.6m/listobject.h" "$(@D)/python_include/listobject.h" && cp -f "/opt/python3.6/include/python3.6m/longintrepr.h" "$(@D)/python_include/longintrepr.h" && cp -f "/opt/python3.6/include/python3.6m/longobject.h" "$(@D)/python_include/longobject.h" && cp -f "/opt/python3.6/include/python3.6m/marshal.h" "$(@D)/python_include/marshal.h" && cp -f "/opt/python3.6/include/python3.6m/memoryobject.h" "$(@D)/python_include/memoryobject.h" && cp -f "/opt/python3.6/include/python3.6m/metagrammar.h" "$(@D)/python_include/metagrammar.h" && cp -f "/opt/python3.6/include/python3.6m/methodobject.h" "$(@D)/python_include/methodobject.h" && cp -f "/opt/python3.6/include/python3.6m/modsupport.h" "$(@D)/python_include/modsupport.h" && cp -f "/opt/python3.6/include/python3.6m/moduleobject.h" "$(@D)/python_include/moduleobject.h" && cp -f "/opt/python3.6/include/python3.6m/namespaceobject.h" "$(@D)/python_include/namespaceobject.h" && cp -f "/opt/python3.6/include/python3.6m/node.h" "$(@D)/python_include/node.h" && cp -f "/opt/python3.6/include/python3.6m/object.h" "$(@D)/python_include/object.h" && cp -f "/opt/python3.6/include/python3.6m/objimpl.h" "$(@D)/python_include/objimpl.h" && cp -f "/opt/python3.6/include/python3.6m/odictobject.h" "$(@D)/python_include/odictobject.h" && cp -f "/opt/python3.6/include/python3.6m/opcode.h" "$(@D)/python_include/opcode.h" && cp -f "/opt/python3.6/include/python3.6m/osdefs.h" "$(@D)/python_include/osdefs.h" && cp -f "/opt/python3.6/include/python3.6m/osmodule.h" "$(@D)/python_include/osmodule.h" && cp -f "/opt/python3.6/include/python3.6m/parsetok.h" "$(@D)/python_include/parsetok.h" && cp -f "/opt/python3.6/include/python3.6m/patchlevel.h" "$(@D)/python_include/patchlevel.h" && cp -f "/opt/python3.6/include/python3.6m/pgen.h" "$(@D)/python_include/pgen.h" && cp -f "/opt/python3.6/include/python3.6m/pgenheaders.h" "$(@D)/python_include/pgenheaders.h" && cp -f "/opt/python3.6/include/python3.6m/py_curses.h" "$(@D)/python_include/py_curses.h" && cp -f "/opt/python3.6/include/python3.6m/pyarena.h" "$(@D)/python_include/pyarena.h" && cp -f "/opt/python3.6/include/python3.6m/pyatomic.h" "$(@D)/python_include/pyatomic.h" && cp -f "/opt/python3.6/include/python3.6m/pycapsule.h" "$(@D)/python_include/pycapsule.h" && cp -f "/opt/python3.6/include/python3.6m/pyconfig.h" "$(@D)/python_include/pyconfig.h" && cp -f "/opt/python3.6/include/python3.6m/pyctype.h" "$(@D)/python_include/pyctype.h" && cp -f "/opt/python3.6/include/python3.6m/pydebug.h" "$(@D)/python_include/pydebug.h" && cp -f "/opt/python3.6/include/python3.6m/pydtrace.h" "$(@D)/python_include/pydtrace.h" && cp -f "/opt/python3.6/include/python3.6m/pyerrors.h" "$(@D)/python_include/pyerrors.h" && cp -f "/opt/python3.6/include/python3.6m/pyexpat.h" "$(@D)/python_include/pyexpat.h" && cp -f "/opt/python3.6/include/python3.6m/pyfpe.h" "$(@D)/python_include/pyfpe.h" && cp -f "/opt/python3.6/include/python3.6m/pygetopt.h" "$(@D)/python_include/pygetopt.h" && cp -f "/opt/python3.6/include/python3.6m/pyhash.h" "$(@D)/python_include/pyhash.h" && cp -f "/opt/python3.6/include/python3.6m/pylifecycle.h" "$(@D)/python_include/pylifecycle.h" && cp -f "/opt/python3.6/include/python3.6m/pymacconfig.h" "$(@D)/python_include/pymacconfig.h" && cp -f "/opt/python3.6/include/python3.6m/pymacro.h" "$(@D)/python_include/pymacro.h" && cp -f "/opt/python3.6/include/python3.6m/pymath.h" 
"$(@D)/python_include/pymath.h" && cp -f "/opt/python3.6/include/python3.6m/pymem.h" "$(@D)/python_include/pymem.h" && cp -f "/opt/python3.6/include/python3.6m/pyport.h" "$(@D)/python_include/pyport.h" && cp -f "/opt/python3.6/include/python3.6m/pystate.h" "$(@D)/python_include/pystate.h" && cp -f "/opt/python3.6/include/python3.6m/pystrcmp.h" "$(@D)/python_include/pystrcmp.h" && cp -f "/opt/python3.6/include/python3.6m/pystrhex.h" "$(@D)/python_include/pystrhex.h" && cp -f "/opt/python3.6/include/python3.6m/pystrtod.h" "$(@D)/python_include/pystrtod.h" && cp -f "/opt/python3.6/include/python3.6m/pythonrun.h" "$(@D)/python_include/pythonrun.h" && cp -f "/opt/python3.6/include/python3.6m/pythread.h" "$(@D)/python_include/pythread.h" && cp -f "/opt/python3.6/include/python3.6m/pytime.h" "$(@D)/python_include/pytime.h" && cp -f "/opt/python3.6/include/python3.6m/rangeobject.h" "$(@D)/python_include/rangeobject.h" && cp -f "/opt/python3.6/include/python3.6m/setobject.h" "$(@D)/python_include/setobject.h" && cp -f "/opt/python3.6/include/python3.6m/sliceobject.h" "$(@D)/python_include/sliceobject.h" && cp -f "/opt/python3.6/include/python3.6m/structmember.h" "$(@D)/python_include/structmember.h" && cp -f "/opt/python3.6/include/python3.6m/structseq.h" "$(@D)/python_include/structseq.h" && cp -f "/opt/python3.6/include/python3.6m/symtable.h" "$(@D)/python_include/symtable.h" && cp -f "/opt/python3.6/include/python3.6m/sysmodule.h" "$(@D)/python_include/sysmodule.h" && cp -f "/opt/python3.6/include/python3.6m/token.h" "$(@D)/python_include/token.h" && cp -f "/opt/python3.6/include/python3.6m/traceback.h" "$(@D)/python_include/traceback.h" && cp -f "/opt/python3.6/include/python3.6m/tupleobject.h" "$(@D)/python_include/tupleobject.h" && cp -f "/opt/python3.6/include/python3.6m/typeslots.h" "$(@D)/python_include/typeslots.h" && cp -f "/opt/python3.6/include/python3.6m/ucnhash.h" "$(@D)/python_include/ucnhash.h" && cp -f "/opt/python3.6/include/python3.6m/unicodeobject.h" "$(@D)/python_include/unicodeobject.h" && cp -f "/opt/python3.6/include/python3.6m/warnings.h" "$(@D)/python_include/warnings.h" && cp -f "/opt/python3.6/include/python3.6m/weakrefobject.h" "$(@D)/python_include/weakrefobject.h" - """, -) - -genrule( - name = "numpy_include", - outs = [ - "numpy_include/numpy/__multiarray_api.h", - "numpy_include/numpy/__ufunc_api.h", - "numpy_include/numpy/_neighborhood_iterator_imp.h", - "numpy_include/numpy/_numpyconfig.h", - "numpy_include/numpy/arrayobject.h", - "numpy_include/numpy/arrayscalars.h", - "numpy_include/numpy/halffloat.h", - "numpy_include/numpy/multiarray_api.txt", - "numpy_include/numpy/ndarrayobject.h", - "numpy_include/numpy/ndarraytypes.h", - "numpy_include/numpy/noprefix.h", - "numpy_include/numpy/npy_1_7_deprecated_api.h", - "numpy_include/numpy/npy_3kcompat.h", - "numpy_include/numpy/npy_common.h", - "numpy_include/numpy/npy_cpu.h", - "numpy_include/numpy/npy_endian.h", - "numpy_include/numpy/npy_interrupt.h", - "numpy_include/numpy/npy_math.h", - "numpy_include/numpy/npy_no_deprecated_api.h", - "numpy_include/numpy/npy_os.h", - "numpy_include/numpy/numpyconfig.h", - "numpy_include/numpy/old_defines.h", - "numpy_include/numpy/oldnumeric.h", - "numpy_include/numpy/ufunc_api.txt", - "numpy_include/numpy/ufuncobject.h", - "numpy_include/numpy/utils.h", - ], - cmd = """ -cp -f "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/__multiarray_api.h" "$(@D)/numpy_include/numpy/__multiarray_api.h" && cp -f 
"/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/__ufunc_api.h" "$(@D)/numpy_include/numpy/__ufunc_api.h" && cp -f "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/_neighborhood_iterator_imp.h" "$(@D)/numpy_include/numpy/_neighborhood_iterator_imp.h" && cp -f "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/_numpyconfig.h" "$(@D)/numpy_include/numpy/_numpyconfig.h" && cp -f "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/arrayobject.h" "$(@D)/numpy_include/numpy/arrayobject.h" && cp -f "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/arrayscalars.h" "$(@D)/numpy_include/numpy/arrayscalars.h" && cp -f "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/halffloat.h" "$(@D)/numpy_include/numpy/halffloat.h" && cp -f "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/multiarray_api.txt" "$(@D)/numpy_include/numpy/multiarray_api.txt" && cp -f "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/ndarrayobject.h" "$(@D)/numpy_include/numpy/ndarrayobject.h" && cp -f "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/ndarraytypes.h" "$(@D)/numpy_include/numpy/ndarraytypes.h" && cp -f "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/noprefix.h" "$(@D)/numpy_include/numpy/noprefix.h" && cp -f "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_1_7_deprecated_api.h" && cp -f "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_3kcompat.h" "$(@D)/numpy_include/numpy/npy_3kcompat.h" && cp -f "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_common.h" "$(@D)/numpy_include/numpy/npy_common.h" && cp -f "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_cpu.h" "$(@D)/numpy_include/numpy/npy_cpu.h" && cp -f "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_endian.h" "$(@D)/numpy_include/numpy/npy_endian.h" && cp -f "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_interrupt.h" "$(@D)/numpy_include/numpy/npy_interrupt.h" && cp -f "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_math.h" "$(@D)/numpy_include/numpy/npy_math.h" && cp -f "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_no_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_no_deprecated_api.h" && cp -f "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_os.h" "$(@D)/numpy_include/numpy/npy_os.h" && cp -f "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/numpyconfig.h" "$(@D)/numpy_include/numpy/numpyconfig.h" && cp -f "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/old_defines.h" "$(@D)/numpy_include/numpy/old_defines.h" && cp -f "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/oldnumeric.h" "$(@D)/numpy_include/numpy/oldnumeric.h" && cp -f "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/ufunc_api.txt" "$(@D)/numpy_include/numpy/ufunc_api.txt" && cp -f "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/ufuncobject.h" "$(@D)/numpy_include/numpy/ufuncobject.h" && cp -f "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/utils.h" "$(@D)/numpy_include/numpy/utils.h" - """, -) diff --git a/third_party/toolchains/preconfig/ubuntu16.04/py3_opt/WORKSPACE 
b/third_party/toolchains/preconfig/ubuntu16.04/py3_opt/WORKSPACE deleted file mode 100644 index 1d298fefa3b..00000000000 --- a/third_party/toolchains/preconfig/ubuntu16.04/py3_opt/WORKSPACE +++ /dev/null @@ -1,2 +0,0 @@ -# DO NOT EDIT: automatically generated WORKSPACE file for python_configure rule -workspace(name = "local_config_python") diff --git a/third_party/toolchains/preconfig/ubuntu16.04/rocm/WORKSPACE b/third_party/toolchains/preconfig/ubuntu16.04/rocm/WORKSPACE deleted file mode 100644 index 6dcd3551ce0..00000000000 --- a/third_party/toolchains/preconfig/ubuntu16.04/rocm/WORKSPACE +++ /dev/null @@ -1,2 +0,0 @@ -# DO NOT EDIT: automatically generated WORKSPACE file for rocm_configure rule -workspace(name = "local_config_rocm") diff --git a/third_party/toolchains/preconfig/ubuntu16.04/rocm/rocm/BUILD b/third_party/toolchains/preconfig/ubuntu16.04/rocm/rocm/BUILD deleted file mode 100755 index a8217711803..00000000000 --- a/third_party/toolchains/preconfig/ubuntu16.04/rocm/rocm/BUILD +++ /dev/null @@ -1,1512 +0,0 @@ -load("@bazel_skylib//:bzl_library.bzl", "bzl_library") - -licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like - -package(default_visibility = ["//visibility:public"]) - -config_setting( - name = "using_hipcc", - values = { - "define": "using_rocm_hipcc=true", - }, -) - -cc_library( - name = "rocm_headers", - hdrs = [ - "rocm/rocm_config.h", - ":hipsparse-include", - ":miopen-include", - ":rccl-include", - ":rocblas-include", - ":rocfft-include", - ":rocm-include", - ], - includes = [ - ".", - "rocm/include", - "rocm/include/rocrand", - ], - visibility = ["//visibility:public"], -) - -cc_library( - name = "hip", - srcs = ["rocm/lib/libhip_hcc.so"], - data = ["rocm/lib/libhip_hcc.so"], - includes = [ - ".", - "rocm/include", - ], - linkstatic = 1, - visibility = ["//visibility:public"], -) - -cc_library( - name = "rocblas", - srcs = ["rocm/lib/librocblas.so"], - data = ["rocm/lib/librocblas.so"], - includes = [ - ".", - "rocm/include", - ], - linkstatic = 1, - visibility = ["//visibility:public"], -) - -cc_library( - name = "rocfft", - srcs = ["rocm/lib/librocfft.so"], - data = ["rocm/lib/librocfft.so"], - includes = [ - ".", - "rocm/include", - ], - linkstatic = 1, - visibility = ["//visibility:public"], -) - -cc_library( - name = "hiprand", - srcs = ["rocm/lib/libhiprand.so"], - data = ["rocm/lib/libhiprand.so"], - includes = [ - ".", - "rocm/include", - "rocm/include/rocrand", - ], - linkstatic = 1, - visibility = ["//visibility:public"], -) - -cc_library( - name = "miopen", - srcs = ["rocm/lib/libMIOpen.so"], - data = ["rocm/lib/libMIOpen.so"], - includes = [ - ".", - "rocm/include", - ], - linkstatic = 1, - visibility = ["//visibility:public"], -) - -cc_library( - name = "rccl", - srcs = ["rocm/lib/librccl.so"], - data = ["rocm/lib/librccl.so"], - includes = [ - ".", - "rocm/include", - ], - linkstatic = 1, - visibility = ["//visibility:public"], -) - -cc_library( - name = "rocm", - visibility = ["//visibility:public"], - deps = [ - ":hip", - ":hiprand", - ":miopen", - ":rocblas", - ":rocfft", - ":rocm_headers", - ], -) - -bzl_library( - name = "build_defs_bzl", - srcs = ["build_defs.bzl"], -) - -cc_library( - name = "rocprim", - srcs = [ - "rocm/include/hipcub/hipcub_version.hpp", - "rocm/include/rocprim/rocprim_version.hpp", - ], - hdrs = glob([ - "rocm/include/hipcub/**", - "rocm/include/rocprim/**", - ]), - includes = [ - ".", - "rocm/include/hipcub", - "rocm/include/rocprim", - ], - visibility = ["//visibility:public"], - deps = [ - 
"@local_config_rocm//rocm:rocm_headers", - ], -) - -cc_import( - name = "hipsparse", - hdrs = glob(["rocm/include/hipsparse/**"]), - shared_library = "rocm/lib/libhipsparse.so", - visibility = ["//visibility:public"], -) - -genrule( - name = "rocm-include", - outs = [ - "rocm/include/amd_comgr.h", - "rocm/include/amd_hsa_common.h", - "rocm/include/amd_hsa_elf.h", - "rocm/include/amd_hsa_kernel_code.h", - "rocm/include/amd_hsa_queue.h", - "rocm/include/amd_hsa_signal.h", - "rocm/include/base/backend_manager.hpp", - "rocm/include/base/base_rocalution.hpp", - "rocm/include/base/global_matrix.hpp", - "rocm/include/base/global_vector.hpp", - "rocm/include/base/local_matrix.hpp", - "rocm/include/base/local_stencil.hpp", - "rocm/include/base/local_vector.hpp", - "rocm/include/base/matrix_formats.hpp", - "rocm/include/base/matrix_formats_ind.hpp", - "rocm/include/base/operator.hpp", - "rocm/include/base/parallel_manager.hpp", - "rocm/include/base/stencil_types.hpp", - "rocm/include/base/vector.hpp", - "rocm/include/device_amd_hsa.h", - "rocm/include/hcc/amd_hsa_common.h", - "rocm/include/hcc/amd_hsa_elf.h", - "rocm/include/hcc/amd_hsa_kernel_code.h", - "rocm/include/hcc/amd_hsa_queue.h", - "rocm/include/hcc/amd_hsa_signal.h", - "rocm/include/hcc/array_view", - "rocm/include/hcc/clang-c/BuildSystem.h", - "rocm/include/hcc/clang-c/CXCompilationDatabase.h", - "rocm/include/hcc/clang-c/CXErrorCode.h", - "rocm/include/hcc/clang-c/CXString.h", - "rocm/include/hcc/clang-c/Documentation.h", - "rocm/include/hcc/clang-c/FatalErrorHandler.h", - "rocm/include/hcc/clang-c/Index.h", - "rocm/include/hcc/clang-c/Platform.h", - "rocm/include/hcc/coordinate", - "rocm/include/hcc/device_amd_hsa.h", - "rocm/include/hcc/experimental/algorithm", - "rocm/include/hcc/experimental/exception_list", - "rocm/include/hcc/experimental/execution_policy", - "rocm/include/hcc/experimental/impl/algorithm_impl.inl", - "rocm/include/hcc/experimental/impl/algorithm_impl_seq.inl", - "rocm/include/hcc/experimental/impl/exclusive_scan.inl", - "rocm/include/hcc/experimental/impl/inclusive_scan.inl", - "rocm/include/hcc/experimental/impl/kernel_launch.inl", - "rocm/include/hcc/experimental/impl/numeric_impl_seq.inl", - "rocm/include/hcc/experimental/impl/reduce.inl", - "rocm/include/hcc/experimental/impl/scan.inl", - "rocm/include/hcc/experimental/impl/sort.inl", - "rocm/include/hcc/experimental/impl/stablesort.inl", - "rocm/include/hcc/experimental/impl/transform.inl", - "rocm/include/hcc/experimental/impl/transform_exclusive_scan.inl", - "rocm/include/hcc/experimental/impl/transform_inclusive_scan.inl", - "rocm/include/hcc/experimental/impl/transform_reduce.inl", - "rocm/include/hcc/experimental/impl/transform_scan.inl", - "rocm/include/hcc/experimental/impl/type_utils.inl", - "rocm/include/hcc/experimental/numeric", - "rocm/include/hcc/grid_launch.h", - "rocm/include/hcc/grid_launch.hpp", - "rocm/include/hcc/hc.hpp", - "rocm/include/hcc/hc_am.hpp", - "rocm/include/hcc/hc_am_internal.hpp", - "rocm/include/hcc/hc_defines.h", - "rocm/include/hcc/hc_math.hpp", - "rocm/include/hcc/hc_norm_unorm.inl", - "rocm/include/hcc/hc_printf.hpp", - "rocm/include/hcc/hc_prof_runtime.h", - "rocm/include/hcc/hc_rt_debug.h", - "rocm/include/hcc/hc_short_vector.hpp", - "rocm/include/hcc/hc_short_vector.inl", - "rocm/include/hcc/hcc_features.hpp", - "rocm/include/hcc/hsa.h", - "rocm/include/hcc/hsa_atomic.h", - "rocm/include/hcc/kalmar_aligned_alloc.h", - "rocm/include/hcc/kalmar_buffer.h", - "rocm/include/hcc/kalmar_cpu_launch.h", - 
"rocm/include/hcc/kalmar_exception.h", - "rocm/include/hcc/kalmar_index.h", - "rocm/include/hcc/kalmar_launch.h", - "rocm/include/hcc/kalmar_math.h", - "rocm/include/hcc/kalmar_runtime.h", - "rocm/include/hcc/kalmar_serialize.h", - "rocm/include/hcc/kalmar_short_vectors.inl", - "rocm/include/hcc/llvm-c/Remarks.h", - "rocm/include/hcc/llvm-c/lto.h", - "rocm/include/hcc/llvm/Target/AMDGPU/AMDGPU.h", - "rocm/include/hcc/llvm/Target/AMDGPU/Disassembler/CodeObjectDisassembler.h", - "rocm/include/hcc/ockl.h", - "rocm/include/hcc/ockl_hsa.h", - "rocm/include/hcc/ocml.h", - "rocm/include/hcc/pinned_vector.hpp", - "rocm/include/hip/channel_descriptor.h", - "rocm/include/hip/device_functions.h", - "rocm/include/hip/driver_types.h", - "rocm/include/hip/hcc_detail/channel_descriptor.h", - "rocm/include/hip/hcc_detail/code_object_bundle.hpp", - "rocm/include/hip/hcc_detail/concepts.hpp", - "rocm/include/hip/hcc_detail/cuda/cuda.h", - "rocm/include/hip/hcc_detail/cuda/math_functions.h", - "rocm/include/hip/hcc_detail/device_functions.h", - "rocm/include/hip/hcc_detail/device_library_decls.h", - "rocm/include/hip/hcc_detail/driver_types.h", - "rocm/include/hip/hcc_detail/elfio/elf_types.hpp", - "rocm/include/hip/hcc_detail/elfio/elfio.hpp", - "rocm/include/hip/hcc_detail/elfio/elfio_dump.hpp", - "rocm/include/hip/hcc_detail/elfio/elfio_dynamic.hpp", - "rocm/include/hip/hcc_detail/elfio/elfio_header.hpp", - "rocm/include/hip/hcc_detail/elfio/elfio_note.hpp", - "rocm/include/hip/hcc_detail/elfio/elfio_relocation.hpp", - "rocm/include/hip/hcc_detail/elfio/elfio_section.hpp", - "rocm/include/hip/hcc_detail/elfio/elfio_segment.hpp", - "rocm/include/hip/hcc_detail/elfio/elfio_strings.hpp", - "rocm/include/hip/hcc_detail/elfio/elfio_symbols.hpp", - "rocm/include/hip/hcc_detail/elfio/elfio_utils.hpp", - "rocm/include/hip/hcc_detail/functional_grid_launch.hpp", - "rocm/include/hip/hcc_detail/grid_launch.h", - "rocm/include/hip/hcc_detail/grid_launch.hpp", - "rocm/include/hip/hcc_detail/grid_launch_GGL.hpp", - "rocm/include/hip/hcc_detail/helpers.hpp", - "rocm/include/hip/hcc_detail/hip_atomic.h", - "rocm/include/hip/hcc_detail/hip_common.h", - "rocm/include/hip/hcc_detail/hip_complex.h", - "rocm/include/hip/hcc_detail/hip_cooperative_groups.h", - "rocm/include/hip/hcc_detail/hip_cooperative_groups_helper.h", - "rocm/include/hip/hcc_detail/hip_db.h", - "rocm/include/hip/hcc_detail/hip_fp16.h", - "rocm/include/hip/hcc_detail/hip_fp16_gcc.h", - "rocm/include/hip/hcc_detail/hip_fp16_math_fwd.h", - "rocm/include/hip/hcc_detail/hip_ldg.h", - "rocm/include/hip/hcc_detail/hip_memory.h", - "rocm/include/hip/hcc_detail/hip_prof_str.h", - "rocm/include/hip/hcc_detail/hip_runtime.h", - "rocm/include/hip/hcc_detail/hip_runtime_api.h", - "rocm/include/hip/hcc_detail/hip_runtime_prof.h", - "rocm/include/hip/hcc_detail/hip_surface_types.h", - "rocm/include/hip/hcc_detail/hip_texture_types.h", - "rocm/include/hip/hcc_detail/hip_vector_types.h", - "rocm/include/hip/hcc_detail/hiprtc.h", - "rocm/include/hip/hcc_detail/host_defines.h", - "rocm/include/hip/hcc_detail/hsa_helpers.hpp", - "rocm/include/hip/hcc_detail/library_types.h", - "rocm/include/hip/hcc_detail/llvm_intrinsics.h", - "rocm/include/hip/hcc_detail/macro_based_grid_launch.hpp", - "rocm/include/hip/hcc_detail/math_functions.h", - "rocm/include/hip/hcc_detail/math_fwd.h", - "rocm/include/hip/hcc_detail/program_state.hpp", - "rocm/include/hip/hcc_detail/surface_functions.h", - "rocm/include/hip/hcc_detail/texture_functions.h", - 
"rocm/include/hip/hcc_detail/texture_types.h", - "rocm/include/hip/hip_common.h", - "rocm/include/hip/hip_complex.h", - "rocm/include/hip/hip_cooperative_groups.h", - "rocm/include/hip/hip_ext.h", - "rocm/include/hip/hip_fp16.h", - "rocm/include/hip/hip_hcc.h", - "rocm/include/hip/hip_profile.h", - "rocm/include/hip/hip_runtime.h", - "rocm/include/hip/hip_runtime_api.h", - "rocm/include/hip/hip_texture_types.h", - "rocm/include/hip/hip_vector_types.h", - "rocm/include/hip/hiprtc.h", - "rocm/include/hip/library_types.h", - "rocm/include/hip/math_functions.h", - "rocm/include/hip/nvcc_detail/channel_descriptor.h", - "rocm/include/hip/nvcc_detail/hip_complex.h", - "rocm/include/hip/nvcc_detail/hip_runtime.h", - "rocm/include/hip/nvcc_detail/hip_runtime_api.h", - "rocm/include/hip/nvcc_detail/hip_texture_types.h", - "rocm/include/hip/texture_types.h", - "rocm/include/hipblas-export.h", - "rocm/include/hipblas-version.h", - "rocm/include/hipblas.h", - "rocm/include/hipcub/config.hpp", - "rocm/include/hipcub/cub/device/device_histogram.hpp", - "rocm/include/hipcub/cub/device/device_radix_sort.hpp", - "rocm/include/hipcub/cub/device/device_reduce.hpp", - "rocm/include/hipcub/cub/device/device_run_length_encode.hpp", - "rocm/include/hipcub/cub/device/device_scan.hpp", - "rocm/include/hipcub/cub/device/device_segmented_radix_sort.hpp", - "rocm/include/hipcub/cub/device/device_segmented_reduce.hpp", - "rocm/include/hipcub/cub/device/device_select.hpp", - "rocm/include/hipcub/cub/hipcub.hpp", - "rocm/include/hipcub/cub/util_allocator.hpp", - "rocm/include/hipcub/hipcub.hpp", - "rocm/include/hipcub/hipcub_version.hpp", - "rocm/include/hipcub/rocprim/block/block_discontinuity.hpp", - "rocm/include/hipcub/rocprim/block/block_exchange.hpp", - "rocm/include/hipcub/rocprim/block/block_histogram.hpp", - "rocm/include/hipcub/rocprim/block/block_load.hpp", - "rocm/include/hipcub/rocprim/block/block_load_func.hpp", - "rocm/include/hipcub/rocprim/block/block_radix_sort.hpp", - "rocm/include/hipcub/rocprim/block/block_reduce.hpp", - "rocm/include/hipcub/rocprim/block/block_scan.hpp", - "rocm/include/hipcub/rocprim/block/block_store.hpp", - "rocm/include/hipcub/rocprim/block/block_store_func.hpp", - "rocm/include/hipcub/rocprim/device/device_histogram.hpp", - "rocm/include/hipcub/rocprim/device/device_radix_sort.hpp", - "rocm/include/hipcub/rocprim/device/device_reduce.hpp", - "rocm/include/hipcub/rocprim/device/device_run_length_encode.hpp", - "rocm/include/hipcub/rocprim/device/device_scan.hpp", - "rocm/include/hipcub/rocprim/device/device_segmented_radix_sort.hpp", - "rocm/include/hipcub/rocprim/device/device_segmented_reduce.hpp", - "rocm/include/hipcub/rocprim/device/device_select.hpp", - "rocm/include/hipcub/rocprim/hipcub.hpp", - "rocm/include/hipcub/rocprim/iterator/arg_index_input_iterator.hpp", - "rocm/include/hipcub/rocprim/iterator/constant_input_iterator.hpp", - "rocm/include/hipcub/rocprim/iterator/counting_input_iterator.hpp", - "rocm/include/hipcub/rocprim/iterator/tex_obj_input_iterator.hpp", - "rocm/include/hipcub/rocprim/iterator/transform_input_iterator.hpp", - "rocm/include/hipcub/rocprim/thread/thread_operators.hpp", - "rocm/include/hipcub/rocprim/util_allocator.hpp", - "rocm/include/hipcub/rocprim/util_ptx.hpp", - "rocm/include/hipcub/rocprim/util_type.hpp", - "rocm/include/hipcub/rocprim/warp/warp_reduce.hpp", - "rocm/include/hipcub/rocprim/warp/warp_scan.hpp", - "rocm/include/hipfft.h", - "rocm/include/hiprand/hiprand.h", - "rocm/include/hiprand/hiprand.hpp", - 
"rocm/include/hiprand/hiprand_hcc.h", - "rocm/include/hiprand/hiprand_kernel.h", - "rocm/include/hiprand/hiprand_kernel_hcc.h", - "rocm/include/hiprand/hiprand_kernel_nvcc.h", - "rocm/include/hiprand/hiprand_mtgp32_host.h", - "rocm/include/hiprand/hiprand_nvcc.h", - "rocm/include/hiprand/hiprand_version.h", - "rocm/include/hipsparse-export.h", - "rocm/include/hipsparse-version.h", - "rocm/include/hipsparse.h", - "rocm/include/hsa.h", - "rocm/include/hsa/Brig.h", - "rocm/include/hsa/amd_hsa_common.h", - "rocm/include/hsa/amd_hsa_elf.h", - "rocm/include/hsa/amd_hsa_kernel_code.h", - "rocm/include/hsa/amd_hsa_queue.h", - "rocm/include/hsa/amd_hsa_signal.h", - "rocm/include/hsa/amd_hsa_tools_interfaces.h", - "rocm/include/hsa/hsa.h", - "rocm/include/hsa/hsa_api_trace.h", - "rocm/include/hsa/hsa_ext_amd.h", - "rocm/include/hsa/hsa_ext_debugger.h", - "rocm/include/hsa/hsa_ext_finalize.h", - "rocm/include/hsa/hsa_ext_image.h", - "rocm/include/hsa/hsa_ext_profiler.h", - "rocm/include/hsa/hsa_ven_amd_aqlprofile.h", - "rocm/include/hsa/hsa_ven_amd_loader.h", - "rocm/include/hsakmt.h", - "rocm/include/hsakmttypes.h", - "rocm/include/miopen/config.h", - "rocm/include/miopen/export.h", - "rocm/include/miopen/miopen.h", - "rocm/include/miopen/version.h", - "rocm/include/miopen_kernel_includes.h", - "rocm/include/miopen_kernels.h", - "rocm/include/miopengemm/accuracytests.hpp", - "rocm/include/miopengemm/alphagenerator.hpp", - "rocm/include/miopengemm/apitest.hpp", - "rocm/include/miopengemm/architests.hpp", - "rocm/include/miopengemm/basegenerator.hpp", - "rocm/include/miopengemm/betacgenerator.hpp", - "rocm/include/miopengemm/bundle.hpp", - "rocm/include/miopengemm/bylinegenerator.hpp", - "rocm/include/miopengemm/copygenerator.hpp", - "rocm/include/miopengemm/cpugemm.hpp", - "rocm/include/miopengemm/derivedparams.hpp", - "rocm/include/miopengemm/enums.hpp", - "rocm/include/miopengemm/error.hpp", - "rocm/include/miopengemm/findparams.hpp", - "rocm/include/miopengemm/floattostring.hpp", - "rocm/include/miopengemm/gemm.hpp", - "rocm/include/miopengemm/geometries.hpp", - "rocm/include/miopengemm/geometry.hpp", - "rocm/include/miopengemm/graph.hpp", - "rocm/include/miopengemm/hint.hpp", - "rocm/include/miopengemm/hyperparams.hpp", - "rocm/include/miopengemm/kernelcache.hpp", - "rocm/include/miopengemm/kernelcachemerge.hpp", - "rocm/include/miopengemm/kernelstring.hpp", - "rocm/include/miopengemm/macgrid.hpp", - "rocm/include/miopengemm/miogemm.hpp", - "rocm/include/miopengemm/nearest.hpp", - "rocm/include/miopengemm/normalformgenerator.hpp", - "rocm/include/miopengemm/oclutil.hpp", - "rocm/include/miopengemm/outputwriter.hpp", - "rocm/include/miopengemm/platform.hpp", - "rocm/include/miopengemm/prepgenerator.hpp", - "rocm/include/miopengemm/programcacher.hpp", - "rocm/include/miopengemm/programs.hpp", - "rocm/include/miopengemm/randomutil.hpp", - "rocm/include/miopengemm/redirection.hpp", - "rocm/include/miopengemm/setabcw.hpp", - "rocm/include/miopengemm/solution.hpp", - "rocm/include/miopengemm/standalone.hpp", - "rocm/include/miopengemm/stringutilbase.hpp", - "rocm/include/miopengemm/tiling.hpp", - "rocm/include/miopengemm/timer.hpp", - "rocm/include/miopengemm/tinyone.hpp", - "rocm/include/miopengemm/tinytwo.hpp", - "rocm/include/miopengemm/tinyzero.hpp", - "rocm/include/ockl.h", - "rocm/include/ockl_hsa.h", - "rocm/include/ocml.h", - "rocm/include/opencl1.2-c.pch", - "rocm/include/opencl2.0-c.pch", - "rocm/include/rccl.h", - "rocm/include/rocalution.hpp", - "rocm/include/rocblas-auxiliary.h", - 
"rocm/include/rocblas-complex-types.h", - "rocm/include/rocblas-export.h", - "rocm/include/rocblas-functions.h", - "rocm/include/rocblas-types.h", - "rocm/include/rocblas-version.h", - "rocm/include/rocblas.h", - "rocm/include/rocblas_bfloat16.h", - "rocm/include/rocfft-export.h", - "rocm/include/rocfft-version.h", - "rocm/include/rocfft.h", - "rocm/include/rocprim/block/block_discontinuity.hpp", - "rocm/include/rocprim/block/block_exchange.hpp", - "rocm/include/rocprim/block/block_histogram.hpp", - "rocm/include/rocprim/block/block_load.hpp", - "rocm/include/rocprim/block/block_load_func.hpp", - "rocm/include/rocprim/block/block_radix_sort.hpp", - "rocm/include/rocprim/block/block_reduce.hpp", - "rocm/include/rocprim/block/block_scan.hpp", - "rocm/include/rocprim/block/block_sort.hpp", - "rocm/include/rocprim/block/block_store.hpp", - "rocm/include/rocprim/block/block_store_func.hpp", - "rocm/include/rocprim/block/detail/block_histogram_atomic.hpp", - "rocm/include/rocprim/block/detail/block_histogram_sort.hpp", - "rocm/include/rocprim/block/detail/block_reduce_raking_reduce.hpp", - "rocm/include/rocprim/block/detail/block_reduce_warp_reduce.hpp", - "rocm/include/rocprim/block/detail/block_scan_reduce_then_scan.hpp", - "rocm/include/rocprim/block/detail/block_scan_warp_scan.hpp", - "rocm/include/rocprim/block/detail/block_sort_bitonic.hpp", - "rocm/include/rocprim/config.hpp", - "rocm/include/rocprim/detail/all_true.hpp", - "rocm/include/rocprim/detail/binary_op_wrappers.hpp", - "rocm/include/rocprim/detail/match_result_type.hpp", - "rocm/include/rocprim/detail/radix_sort.hpp", - "rocm/include/rocprim/detail/various.hpp", - "rocm/include/rocprim/device/config_types.hpp", - "rocm/include/rocprim/device/detail/device_binary_search.hpp", - "rocm/include/rocprim/device/detail/device_histogram.hpp", - "rocm/include/rocprim/device/detail/device_merge.hpp", - "rocm/include/rocprim/device/detail/device_merge_sort.hpp", - "rocm/include/rocprim/device/detail/device_partition.hpp", - "rocm/include/rocprim/device/detail/device_radix_sort.hpp", - "rocm/include/rocprim/device/detail/device_reduce.hpp", - "rocm/include/rocprim/device/detail/device_reduce_by_key.hpp", - "rocm/include/rocprim/device/detail/device_scan_lookback.hpp", - "rocm/include/rocprim/device/detail/device_scan_reduce_then_scan.hpp", - "rocm/include/rocprim/device/detail/device_segmented_radix_sort.hpp", - "rocm/include/rocprim/device/detail/device_segmented_reduce.hpp", - "rocm/include/rocprim/device/detail/device_segmented_scan.hpp", - "rocm/include/rocprim/device/detail/device_transform.hpp", - "rocm/include/rocprim/device/detail/lookback_scan_state.hpp", - "rocm/include/rocprim/device/detail/ordered_block_id.hpp", - "rocm/include/rocprim/device/detail/uint_fast_div.hpp", - "rocm/include/rocprim/device/device_binary_search.hpp", - "rocm/include/rocprim/device/device_histogram.hpp", - "rocm/include/rocprim/device/device_histogram_config.hpp", - "rocm/include/rocprim/device/device_merge.hpp", - "rocm/include/rocprim/device/device_merge_config.hpp", - "rocm/include/rocprim/device/device_merge_sort.hpp", - "rocm/include/rocprim/device/device_merge_sort_config.hpp", - "rocm/include/rocprim/device/device_partition.hpp", - "rocm/include/rocprim/device/device_radix_sort.hpp", - "rocm/include/rocprim/device/device_radix_sort_config.hpp", - "rocm/include/rocprim/device/device_reduce.hpp", - "rocm/include/rocprim/device/device_reduce_by_key.hpp", - "rocm/include/rocprim/device/device_reduce_by_key_config.hpp", - 
"rocm/include/rocprim/device/device_reduce_config.hpp", - "rocm/include/rocprim/device/device_run_length_encode.hpp", - "rocm/include/rocprim/device/device_run_length_encode_config.hpp", - "rocm/include/rocprim/device/device_scan.hpp", - "rocm/include/rocprim/device/device_scan_by_key.hpp", - "rocm/include/rocprim/device/device_scan_config.hpp", - "rocm/include/rocprim/device/device_segmented_radix_sort.hpp", - "rocm/include/rocprim/device/device_segmented_radix_sort_config.hpp", - "rocm/include/rocprim/device/device_segmented_reduce.hpp", - "rocm/include/rocprim/device/device_segmented_scan.hpp", - "rocm/include/rocprim/device/device_select.hpp", - "rocm/include/rocprim/device/device_select_config.hpp", - "rocm/include/rocprim/device/device_transform.hpp", - "rocm/include/rocprim/device/device_transform_config.hpp", - "rocm/include/rocprim/functional.hpp", - "rocm/include/rocprim/intrinsics.hpp", - "rocm/include/rocprim/intrinsics/atomic.hpp", - "rocm/include/rocprim/intrinsics/bit.hpp", - "rocm/include/rocprim/intrinsics/thread.hpp", - "rocm/include/rocprim/intrinsics/warp.hpp", - "rocm/include/rocprim/intrinsics/warp_shuffle.hpp", - "rocm/include/rocprim/iterator.hpp", - "rocm/include/rocprim/iterator/arg_index_iterator.hpp", - "rocm/include/rocprim/iterator/constant_iterator.hpp", - "rocm/include/rocprim/iterator/counting_iterator.hpp", - "rocm/include/rocprim/iterator/detail/replace_first_iterator.hpp", - "rocm/include/rocprim/iterator/discard_iterator.hpp", - "rocm/include/rocprim/iterator/texture_cache_iterator.hpp", - "rocm/include/rocprim/iterator/transform_iterator.hpp", - "rocm/include/rocprim/iterator/zip_iterator.hpp", - "rocm/include/rocprim/rocprim.hpp", - "rocm/include/rocprim/rocprim_version.hpp", - "rocm/include/rocprim/type_traits.hpp", - "rocm/include/rocprim/types.hpp", - "rocm/include/rocprim/types/double_buffer.hpp", - "rocm/include/rocprim/types/integer_sequence.hpp", - "rocm/include/rocprim/types/key_value_pair.hpp", - "rocm/include/rocprim/types/tuple.hpp", - "rocm/include/rocprim/warp/detail/warp_reduce_crosslane.hpp", - "rocm/include/rocprim/warp/detail/warp_reduce_dpp.hpp", - "rocm/include/rocprim/warp/detail/warp_reduce_shared_mem.hpp", - "rocm/include/rocprim/warp/detail/warp_reduce_shuffle.hpp", - "rocm/include/rocprim/warp/detail/warp_scan_crosslane.hpp", - "rocm/include/rocprim/warp/detail/warp_scan_dpp.hpp", - "rocm/include/rocprim/warp/detail/warp_scan_shared_mem.hpp", - "rocm/include/rocprim/warp/detail/warp_scan_shuffle.hpp", - "rocm/include/rocprim/warp/detail/warp_segment_bounds.hpp", - "rocm/include/rocprim/warp/detail/warp_sort_shuffle.hpp", - "rocm/include/rocprim/warp/warp_reduce.hpp", - "rocm/include/rocprim/warp/warp_scan.hpp", - "rocm/include/rocprim/warp/warp_sort.hpp", - "rocm/include/rocprofiler/rocprofiler.h", - "rocm/include/rocrand/rocrand.h", - "rocm/include/rocrand/rocrand.hpp", - "rocm/include/rocrand/rocrand_common.h", - "rocm/include/rocrand/rocrand_discrete.h", - "rocm/include/rocrand/rocrand_discrete_types.h", - "rocm/include/rocrand/rocrand_kernel.h", - "rocm/include/rocrand/rocrand_log_normal.h", - "rocm/include/rocrand/rocrand_mrg32k3a.h", - "rocm/include/rocrand/rocrand_mrg32k3a_precomputed.h", - "rocm/include/rocrand/rocrand_mtgp32.h", - "rocm/include/rocrand/rocrand_mtgp32_11213.h", - "rocm/include/rocrand/rocrand_normal.h", - "rocm/include/rocrand/rocrand_philox4x32_10.h", - "rocm/include/rocrand/rocrand_poisson.h", - "rocm/include/rocrand/rocrand_sobol32.h", - "rocm/include/rocrand/rocrand_sobol_precomputed.h", - 
"rocm/include/rocrand/rocrand_uniform.h", - "rocm/include/rocrand/rocrand_version.h", - "rocm/include/rocrand/rocrand_xorwow.h", - "rocm/include/rocrand/rocrand_xorwow_precomputed.h", - "rocm/include/rocsparse-auxiliary.h", - "rocm/include/rocsparse-complex-types.h", - "rocm/include/rocsparse-export.h", - "rocm/include/rocsparse-functions.h", - "rocm/include/rocsparse-types.h", - "rocm/include/rocsparse-version.h", - "rocm/include/rocsparse.h", - "rocm/include/solvers/chebyshev.hpp", - "rocm/include/solvers/direct/inversion.hpp", - "rocm/include/solvers/direct/lu.hpp", - "rocm/include/solvers/direct/qr.hpp", - "rocm/include/solvers/iter_ctrl.hpp", - "rocm/include/solvers/krylov/bicgstab.hpp", - "rocm/include/solvers/krylov/bicgstabl.hpp", - "rocm/include/solvers/krylov/cg.hpp", - "rocm/include/solvers/krylov/cr.hpp", - "rocm/include/solvers/krylov/fcg.hpp", - "rocm/include/solvers/krylov/fgmres.hpp", - "rocm/include/solvers/krylov/gmres.hpp", - "rocm/include/solvers/krylov/idr.hpp", - "rocm/include/solvers/krylov/qmrcgstab.hpp", - "rocm/include/solvers/mixed_precision.hpp", - "rocm/include/solvers/multigrid/base_amg.hpp", - "rocm/include/solvers/multigrid/base_multigrid.hpp", - "rocm/include/solvers/multigrid/global_pairwise_amg.hpp", - "rocm/include/solvers/multigrid/multigrid.hpp", - "rocm/include/solvers/multigrid/pairwise_amg.hpp", - "rocm/include/solvers/multigrid/ruge_stueben_amg.hpp", - "rocm/include/solvers/multigrid/smoothed_amg.hpp", - "rocm/include/solvers/multigrid/unsmoothed_amg.hpp", - "rocm/include/solvers/preconditioners/preconditioner.hpp", - "rocm/include/solvers/preconditioners/preconditioner_ai.hpp", - "rocm/include/solvers/preconditioners/preconditioner_as.hpp", - "rocm/include/solvers/preconditioners/preconditioner_blockjacobi.hpp", - "rocm/include/solvers/preconditioners/preconditioner_blockprecond.hpp", - "rocm/include/solvers/preconditioners/preconditioner_multicolored.hpp", - "rocm/include/solvers/preconditioners/preconditioner_multicolored_gs.hpp", - "rocm/include/solvers/preconditioners/preconditioner_multicolored_ilu.hpp", - "rocm/include/solvers/preconditioners/preconditioner_multielimination.hpp", - "rocm/include/solvers/preconditioners/preconditioner_saddlepoint.hpp", - "rocm/include/solvers/solver.hpp", - "rocm/include/thrust/adjacent_difference.h", - "rocm/include/thrust/advance.h", - "rocm/include/thrust/binary_search.h", - "rocm/include/thrust/complex.h", - "rocm/include/thrust/copy.h", - "rocm/include/thrust/count.h", - "rocm/include/thrust/detail/adjacent_difference.inl", - "rocm/include/thrust/detail/advance.inl", - "rocm/include/thrust/detail/alignment.h", - "rocm/include/thrust/detail/allocator/allocator_traits.h", - "rocm/include/thrust/detail/allocator/allocator_traits.inl", - "rocm/include/thrust/detail/allocator/copy_construct_range.h", - "rocm/include/thrust/detail/allocator/copy_construct_range.inl", - "rocm/include/thrust/detail/allocator/default_construct_range.h", - "rocm/include/thrust/detail/allocator/default_construct_range.inl", - "rocm/include/thrust/detail/allocator/destroy_range.h", - "rocm/include/thrust/detail/allocator/destroy_range.inl", - "rocm/include/thrust/detail/allocator/fill_construct_range.h", - "rocm/include/thrust/detail/allocator/fill_construct_range.inl", - "rocm/include/thrust/detail/allocator/malloc_allocator.h", - "rocm/include/thrust/detail/allocator/malloc_allocator.inl", - "rocm/include/thrust/detail/allocator/no_throw_allocator.h", - "rocm/include/thrust/detail/allocator/tagged_allocator.h", - 
"rocm/include/thrust/detail/allocator/tagged_allocator.inl", - "rocm/include/thrust/detail/allocator/temporary_allocator.h", - "rocm/include/thrust/detail/allocator/temporary_allocator.inl", - "rocm/include/thrust/detail/binary_search.inl", - "rocm/include/thrust/detail/complex/arithmetic.h", - "rocm/include/thrust/detail/complex/c99math.h", - "rocm/include/thrust/detail/complex/catrig.h", - "rocm/include/thrust/detail/complex/catrigf.h", - "rocm/include/thrust/detail/complex/ccosh.h", - "rocm/include/thrust/detail/complex/ccoshf.h", - "rocm/include/thrust/detail/complex/cexp.h", - "rocm/include/thrust/detail/complex/cexpf.h", - "rocm/include/thrust/detail/complex/clog.h", - "rocm/include/thrust/detail/complex/clogf.h", - "rocm/include/thrust/detail/complex/complex.inl", - "rocm/include/thrust/detail/complex/cpow.h", - "rocm/include/thrust/detail/complex/cproj.h", - "rocm/include/thrust/detail/complex/csinh.h", - "rocm/include/thrust/detail/complex/csinhf.h", - "rocm/include/thrust/detail/complex/csqrt.h", - "rocm/include/thrust/detail/complex/csqrtf.h", - "rocm/include/thrust/detail/complex/ctanh.h", - "rocm/include/thrust/detail/complex/ctanhf.h", - "rocm/include/thrust/detail/complex/math_private.h", - "rocm/include/thrust/detail/complex/stream.h", - "rocm/include/thrust/detail/config.h", - "rocm/include/thrust/detail/config/compiler.h", - "rocm/include/thrust/detail/config/compiler_fence.h", - "rocm/include/thrust/detail/config/config.h", - "rocm/include/thrust/detail/config/cpp_dialect.h", - "rocm/include/thrust/detail/config/debug.h", - "rocm/include/thrust/detail/config/device_system.h", - "rocm/include/thrust/detail/config/exec_check_disable.h", - "rocm/include/thrust/detail/config/forceinline.h", - "rocm/include/thrust/detail/config/global_workarounds.h", - "rocm/include/thrust/detail/config/host_device.h", - "rocm/include/thrust/detail/config/host_system.h", - "rocm/include/thrust/detail/config/simple_defines.h", - "rocm/include/thrust/detail/contiguous_storage.h", - "rocm/include/thrust/detail/contiguous_storage.inl", - "rocm/include/thrust/detail/copy.h", - "rocm/include/thrust/detail/copy.inl", - "rocm/include/thrust/detail/copy_if.h", - "rocm/include/thrust/detail/copy_if.inl", - "rocm/include/thrust/detail/count.inl", - "rocm/include/thrust/detail/cstdint.h", - "rocm/include/thrust/detail/device_delete.inl", - "rocm/include/thrust/detail/device_free.inl", - "rocm/include/thrust/detail/device_malloc.inl", - "rocm/include/thrust/detail/device_new.inl", - "rocm/include/thrust/detail/device_ptr.inl", - "rocm/include/thrust/detail/device_reference.inl", - "rocm/include/thrust/detail/device_vector.inl", - "rocm/include/thrust/detail/dispatch/is_trivial_copy.h", - "rocm/include/thrust/detail/distance.inl", - "rocm/include/thrust/detail/equal.inl", - "rocm/include/thrust/detail/execute_with_allocator.h", - "rocm/include/thrust/detail/execution_policy.h", - "rocm/include/thrust/detail/extrema.inl", - "rocm/include/thrust/detail/fill.inl", - "rocm/include/thrust/detail/find.inl", - "rocm/include/thrust/detail/for_each.inl", - "rocm/include/thrust/detail/function.h", - "rocm/include/thrust/detail/functional.inl", - "rocm/include/thrust/detail/functional/actor.h", - "rocm/include/thrust/detail/functional/actor.inl", - "rocm/include/thrust/detail/functional/argument.h", - "rocm/include/thrust/detail/functional/composite.h", - "rocm/include/thrust/detail/functional/operators.h", - "rocm/include/thrust/detail/functional/operators/arithmetic_operators.h", - 
"rocm/include/thrust/detail/functional/operators/assignment_operator.h", - "rocm/include/thrust/detail/functional/operators/bitwise_operators.h", - "rocm/include/thrust/detail/functional/operators/compound_assignment_operators.h", - "rocm/include/thrust/detail/functional/operators/logical_operators.h", - "rocm/include/thrust/detail/functional/operators/operator_adaptors.h", - "rocm/include/thrust/detail/functional/operators/relational_operators.h", - "rocm/include/thrust/detail/functional/placeholder.h", - "rocm/include/thrust/detail/functional/value.h", - "rocm/include/thrust/detail/gather.inl", - "rocm/include/thrust/detail/generate.inl", - "rocm/include/thrust/detail/get_iterator_value.h", - "rocm/include/thrust/detail/host_vector.inl", - "rocm/include/thrust/detail/inner_product.inl", - "rocm/include/thrust/detail/integer_math.h", - "rocm/include/thrust/detail/integer_traits.h", - "rocm/include/thrust/detail/internal_functional.h", - "rocm/include/thrust/detail/logical.inl", - "rocm/include/thrust/detail/malloc_and_free.h", - "rocm/include/thrust/detail/merge.inl", - "rocm/include/thrust/detail/minmax.h", - "rocm/include/thrust/detail/mismatch.inl", - "rocm/include/thrust/detail/mpl/math.h", - "rocm/include/thrust/detail/numeric_traits.h", - "rocm/include/thrust/detail/overlapped_copy.h", - "rocm/include/thrust/detail/pair.inl", - "rocm/include/thrust/detail/partition.inl", - "rocm/include/thrust/detail/pointer.h", - "rocm/include/thrust/detail/pointer.inl", - "rocm/include/thrust/detail/preprocessor.h", - "rocm/include/thrust/detail/range/head_flags.h", - "rocm/include/thrust/detail/range/tail_flags.h", - "rocm/include/thrust/detail/raw_pointer_cast.h", - "rocm/include/thrust/detail/raw_reference_cast.h", - "rocm/include/thrust/detail/reduce.inl", - "rocm/include/thrust/detail/reference.h", - "rocm/include/thrust/detail/reference.inl", - "rocm/include/thrust/detail/reference_forward_declaration.h", - "rocm/include/thrust/detail/remove.inl", - "rocm/include/thrust/detail/replace.inl", - "rocm/include/thrust/detail/reverse.inl", - "rocm/include/thrust/detail/scan.inl", - "rocm/include/thrust/detail/scatter.inl", - "rocm/include/thrust/detail/seq.h", - "rocm/include/thrust/detail/sequence.inl", - "rocm/include/thrust/detail/set_operations.inl", - "rocm/include/thrust/detail/sort.inl", - "rocm/include/thrust/detail/static_assert.h", - "rocm/include/thrust/detail/static_map.h", - "rocm/include/thrust/detail/swap.h", - "rocm/include/thrust/detail/swap.inl", - "rocm/include/thrust/detail/swap_ranges.inl", - "rocm/include/thrust/detail/tabulate.inl", - "rocm/include/thrust/detail/temporary_array.h", - "rocm/include/thrust/detail/temporary_array.inl", - "rocm/include/thrust/detail/temporary_buffer.h", - "rocm/include/thrust/detail/transform.inl", - "rocm/include/thrust/detail/transform_reduce.inl", - "rocm/include/thrust/detail/transform_scan.inl", - "rocm/include/thrust/detail/trivial_sequence.h", - "rocm/include/thrust/detail/tuple.inl", - "rocm/include/thrust/detail/tuple_meta_transform.h", - "rocm/include/thrust/detail/tuple_transform.h", - "rocm/include/thrust/detail/type_traits.h", - "rocm/include/thrust/detail/type_traits/algorithm/intermediate_type.h", - "rocm/include/thrust/detail/type_traits/function_traits.h", - "rocm/include/thrust/detail/type_traits/has_member_function.h", - "rocm/include/thrust/detail/type_traits/has_nested_type.h", - "rocm/include/thrust/detail/type_traits/has_trivial_assign.h", - "rocm/include/thrust/detail/type_traits/is_call_possible.h", - 
"rocm/include/thrust/detail/type_traits/is_metafunction_defined.h", - "rocm/include/thrust/detail/type_traits/iterator/is_discard_iterator.h", - "rocm/include/thrust/detail/type_traits/iterator/is_output_iterator.h", - "rocm/include/thrust/detail/type_traits/minimum_type.h", - "rocm/include/thrust/detail/type_traits/pointer_traits.h", - "rocm/include/thrust/detail/type_traits/result_of_adaptable_function.h", - "rocm/include/thrust/detail/uninitialized_copy.inl", - "rocm/include/thrust/detail/uninitialized_fill.inl", - "rocm/include/thrust/detail/unique.inl", - "rocm/include/thrust/detail/use_default.h", - "rocm/include/thrust/detail/util/align.h", - "rocm/include/thrust/detail/util/blocking.h", - "rocm/include/thrust/detail/vector_base.h", - "rocm/include/thrust/detail/vector_base.inl", - "rocm/include/thrust/device_allocator.h", - "rocm/include/thrust/device_delete.h", - "rocm/include/thrust/device_free.h", - "rocm/include/thrust/device_malloc.h", - "rocm/include/thrust/device_malloc_allocator.h", - "rocm/include/thrust/device_new.h", - "rocm/include/thrust/device_new_allocator.h", - "rocm/include/thrust/device_ptr.h", - "rocm/include/thrust/device_reference.h", - "rocm/include/thrust/device_vector.h", - "rocm/include/thrust/distance.h", - "rocm/include/thrust/equal.h", - "rocm/include/thrust/execution_policy.h", - "rocm/include/thrust/extrema.h", - "rocm/include/thrust/fill.h", - "rocm/include/thrust/find.h", - "rocm/include/thrust/for_each.h", - "rocm/include/thrust/functional.h", - "rocm/include/thrust/gather.h", - "rocm/include/thrust/generate.h", - "rocm/include/thrust/host_vector.h", - "rocm/include/thrust/inner_product.h", - "rocm/include/thrust/iterator/constant_iterator.h", - "rocm/include/thrust/iterator/counting_iterator.h", - "rocm/include/thrust/iterator/detail/any_assign.h", - "rocm/include/thrust/iterator/detail/any_system_tag.h", - "rocm/include/thrust/iterator/detail/constant_iterator_base.h", - "rocm/include/thrust/iterator/detail/counting_iterator.inl", - "rocm/include/thrust/iterator/detail/device_system_tag.h", - "rocm/include/thrust/iterator/detail/discard_iterator_base.h", - "rocm/include/thrust/iterator/detail/distance_from_result.h", - "rocm/include/thrust/iterator/detail/host_system_tag.h", - "rocm/include/thrust/iterator/detail/is_iterator_category.h", - "rocm/include/thrust/iterator/detail/is_trivial_iterator.h", - "rocm/include/thrust/iterator/detail/iterator_adaptor_base.h", - "rocm/include/thrust/iterator/detail/iterator_category_to_system.h", - "rocm/include/thrust/iterator/detail/iterator_category_to_traversal.h", - "rocm/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h", - "rocm/include/thrust/iterator/detail/iterator_facade_category.h", - "rocm/include/thrust/iterator/detail/iterator_traits.inl", - "rocm/include/thrust/iterator/detail/iterator_traversal_tags.h", - "rocm/include/thrust/iterator/detail/join_iterator.h", - "rocm/include/thrust/iterator/detail/minimum_category.h", - "rocm/include/thrust/iterator/detail/minimum_system.h", - "rocm/include/thrust/iterator/detail/normal_iterator.h", - "rocm/include/thrust/iterator/detail/permutation_iterator_base.h", - "rocm/include/thrust/iterator/detail/retag.h", - "rocm/include/thrust/iterator/detail/reverse_iterator.inl", - "rocm/include/thrust/iterator/detail/reverse_iterator_base.h", - "rocm/include/thrust/iterator/detail/tagged_iterator.h", - "rocm/include/thrust/iterator/detail/transform_iterator.inl", - "rocm/include/thrust/iterator/detail/transform_output_iterator.inl", - 
"rocm/include/thrust/iterator/detail/tuple_of_iterator_references.h", - "rocm/include/thrust/iterator/detail/universal_categories.h", - "rocm/include/thrust/iterator/detail/zip_iterator.inl", - "rocm/include/thrust/iterator/detail/zip_iterator_base.h", - "rocm/include/thrust/iterator/discard_iterator.h", - "rocm/include/thrust/iterator/iterator_adaptor.h", - "rocm/include/thrust/iterator/iterator_categories.h", - "rocm/include/thrust/iterator/iterator_facade.h", - "rocm/include/thrust/iterator/iterator_traits.h", - "rocm/include/thrust/iterator/permutation_iterator.h", - "rocm/include/thrust/iterator/retag.h", - "rocm/include/thrust/iterator/reverse_iterator.h", - "rocm/include/thrust/iterator/transform_iterator.h", - "rocm/include/thrust/iterator/transform_output_iterator.h", - "rocm/include/thrust/iterator/zip_iterator.h", - "rocm/include/thrust/logical.h", - "rocm/include/thrust/memory.h", - "rocm/include/thrust/merge.h", - "rocm/include/thrust/mismatch.h", - "rocm/include/thrust/pair.h", - "rocm/include/thrust/partition.h", - "rocm/include/thrust/random.h", - "rocm/include/thrust/random/detail/discard_block_engine.inl", - "rocm/include/thrust/random/detail/erfcinv.h", - "rocm/include/thrust/random/detail/linear_congruential_engine.inl", - "rocm/include/thrust/random/detail/linear_congruential_engine_discard.h", - "rocm/include/thrust/random/detail/linear_feedback_shift_engine.inl", - "rocm/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h", - "rocm/include/thrust/random/detail/mod.h", - "rocm/include/thrust/random/detail/normal_distribution.inl", - "rocm/include/thrust/random/detail/normal_distribution_base.h", - "rocm/include/thrust/random/detail/random_core_access.h", - "rocm/include/thrust/random/detail/subtract_with_carry_engine.inl", - "rocm/include/thrust/random/detail/uniform_int_distribution.inl", - "rocm/include/thrust/random/detail/uniform_real_distribution.inl", - "rocm/include/thrust/random/detail/xor_combine_engine.inl", - "rocm/include/thrust/random/detail/xor_combine_engine_max.h", - "rocm/include/thrust/random/discard_block_engine.h", - "rocm/include/thrust/random/linear_congruential_engine.h", - "rocm/include/thrust/random/linear_feedback_shift_engine.h", - "rocm/include/thrust/random/normal_distribution.h", - "rocm/include/thrust/random/subtract_with_carry_engine.h", - "rocm/include/thrust/random/uniform_int_distribution.h", - "rocm/include/thrust/random/uniform_real_distribution.h", - "rocm/include/thrust/random/xor_combine_engine.h", - "rocm/include/thrust/reduce.h", - "rocm/include/thrust/remove.h", - "rocm/include/thrust/replace.h", - "rocm/include/thrust/reverse.h", - "rocm/include/thrust/rocthrust_version.hpp", - "rocm/include/thrust/rocthrust_version.hpp.in", - "rocm/include/thrust/scan.h", - "rocm/include/thrust/scatter.h", - "rocm/include/thrust/sequence.h", - "rocm/include/thrust/set_operations.h", - "rocm/include/thrust/sort.h", - "rocm/include/thrust/swap.h", - "rocm/include/thrust/system/cpp/detail/adjacent_difference.h", - "rocm/include/thrust/system/cpp/detail/assign_value.h", - "rocm/include/thrust/system/cpp/detail/binary_search.h", - "rocm/include/thrust/system/cpp/detail/copy.h", - "rocm/include/thrust/system/cpp/detail/copy_if.h", - "rocm/include/thrust/system/cpp/detail/count.h", - "rocm/include/thrust/system/cpp/detail/equal.h", - "rocm/include/thrust/system/cpp/detail/execution_policy.h", - "rocm/include/thrust/system/cpp/detail/extrema.h", - "rocm/include/thrust/system/cpp/detail/fill.h", - 
"rocm/include/thrust/system/cpp/detail/find.h", - "rocm/include/thrust/system/cpp/detail/for_each.h", - "rocm/include/thrust/system/cpp/detail/gather.h", - "rocm/include/thrust/system/cpp/detail/generate.h", - "rocm/include/thrust/system/cpp/detail/get_value.h", - "rocm/include/thrust/system/cpp/detail/inner_product.h", - "rocm/include/thrust/system/cpp/detail/iter_swap.h", - "rocm/include/thrust/system/cpp/detail/logical.h", - "rocm/include/thrust/system/cpp/detail/malloc_and_free.h", - "rocm/include/thrust/system/cpp/detail/memory.inl", - "rocm/include/thrust/system/cpp/detail/merge.h", - "rocm/include/thrust/system/cpp/detail/mismatch.h", - "rocm/include/thrust/system/cpp/detail/par.h", - "rocm/include/thrust/system/cpp/detail/partition.h", - "rocm/include/thrust/system/cpp/detail/reduce.h", - "rocm/include/thrust/system/cpp/detail/reduce_by_key.h", - "rocm/include/thrust/system/cpp/detail/remove.h", - "rocm/include/thrust/system/cpp/detail/replace.h", - "rocm/include/thrust/system/cpp/detail/reverse.h", - "rocm/include/thrust/system/cpp/detail/scan.h", - "rocm/include/thrust/system/cpp/detail/scan_by_key.h", - "rocm/include/thrust/system/cpp/detail/scatter.h", - "rocm/include/thrust/system/cpp/detail/sequence.h", - "rocm/include/thrust/system/cpp/detail/set_operations.h", - "rocm/include/thrust/system/cpp/detail/sort.h", - "rocm/include/thrust/system/cpp/detail/swap_ranges.h", - "rocm/include/thrust/system/cpp/detail/tabulate.h", - "rocm/include/thrust/system/cpp/detail/temporary_buffer.h", - "rocm/include/thrust/system/cpp/detail/transform.h", - "rocm/include/thrust/system/cpp/detail/transform_reduce.h", - "rocm/include/thrust/system/cpp/detail/transform_scan.h", - "rocm/include/thrust/system/cpp/detail/uninitialized_copy.h", - "rocm/include/thrust/system/cpp/detail/uninitialized_fill.h", - "rocm/include/thrust/system/cpp/detail/unique.h", - "rocm/include/thrust/system/cpp/detail/unique_by_key.h", - "rocm/include/thrust/system/cpp/detail/vector.inl", - "rocm/include/thrust/system/cpp/execution_policy.h", - "rocm/include/thrust/system/cpp/memory.h", - "rocm/include/thrust/system/cpp/vector.h", - "rocm/include/thrust/system/cuda/config.h", - "rocm/include/thrust/system/cuda/detail/adjacent_difference.h", - "rocm/include/thrust/system/cuda/detail/assign_value.h", - "rocm/include/thrust/system/cuda/detail/binary_search.h", - "rocm/include/thrust/system/cuda/detail/copy.h", - "rocm/include/thrust/system/cuda/detail/copy_if.h", - "rocm/include/thrust/system/cuda/detail/core/agent_launcher.h", - "rocm/include/thrust/system/cuda/detail/core/alignment.h", - "rocm/include/thrust/system/cuda/detail/core/triple_chevron_launch.h", - "rocm/include/thrust/system/cuda/detail/core/util.h", - "rocm/include/thrust/system/cuda/detail/count.h", - "rocm/include/thrust/system/cuda/detail/cross_system.h", - "rocm/include/thrust/system/cuda/detail/equal.h", - "rocm/include/thrust/system/cuda/detail/error.inl", - "rocm/include/thrust/system/cuda/detail/execution_policy.h", - "rocm/include/thrust/system/cuda/detail/extrema.h", - "rocm/include/thrust/system/cuda/detail/fill.h", - "rocm/include/thrust/system/cuda/detail/find.h", - "rocm/include/thrust/system/cuda/detail/for_each.h", - "rocm/include/thrust/system/cuda/detail/gather.h", - "rocm/include/thrust/system/cuda/detail/generate.h", - "rocm/include/thrust/system/cuda/detail/get_value.h", - "rocm/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h", - "rocm/include/thrust/system/cuda/detail/guarded_driver_types.h", - 
"rocm/include/thrust/system/cuda/detail/inner_product.h", - "rocm/include/thrust/system/cuda/detail/internal/copy_cross_system.h", - "rocm/include/thrust/system/cuda/detail/internal/copy_device_to_device.h", - "rocm/include/thrust/system/cuda/detail/iter_swap.h", - "rocm/include/thrust/system/cuda/detail/logical.h", - "rocm/include/thrust/system/cuda/detail/malloc_and_free.h", - "rocm/include/thrust/system/cuda/detail/memory.inl", - "rocm/include/thrust/system/cuda/detail/memory_buffer.h", - "rocm/include/thrust/system/cuda/detail/merge.h", - "rocm/include/thrust/system/cuda/detail/mismatch.h", - "rocm/include/thrust/system/cuda/detail/par.h", - "rocm/include/thrust/system/cuda/detail/par_to_seq.h", - "rocm/include/thrust/system/cuda/detail/parallel_for.h", - "rocm/include/thrust/system/cuda/detail/partition.h", - "rocm/include/thrust/system/cuda/detail/reduce.h", - "rocm/include/thrust/system/cuda/detail/reduce_by_key.h", - "rocm/include/thrust/system/cuda/detail/remove.h", - "rocm/include/thrust/system/cuda/detail/replace.h", - "rocm/include/thrust/system/cuda/detail/reverse.h", - "rocm/include/thrust/system/cuda/detail/scan.h", - "rocm/include/thrust/system/cuda/detail/scan_by_key.h", - "rocm/include/thrust/system/cuda/detail/scatter.h", - "rocm/include/thrust/system/cuda/detail/sequence.h", - "rocm/include/thrust/system/cuda/detail/set_operations.h", - "rocm/include/thrust/system/cuda/detail/sort.h", - "rocm/include/thrust/system/cuda/detail/swap_ranges.h", - "rocm/include/thrust/system/cuda/detail/tabulate.h", - "rocm/include/thrust/system/cuda/detail/temporary_buffer.h", - "rocm/include/thrust/system/cuda/detail/terminate.h", - "rocm/include/thrust/system/cuda/detail/transform.h", - "rocm/include/thrust/system/cuda/detail/transform_reduce.h", - "rocm/include/thrust/system/cuda/detail/transform_scan.h", - "rocm/include/thrust/system/cuda/detail/uninitialized_copy.h", - "rocm/include/thrust/system/cuda/detail/uninitialized_fill.h", - "rocm/include/thrust/system/cuda/detail/unique.h", - "rocm/include/thrust/system/cuda/detail/unique_by_key.h", - "rocm/include/thrust/system/cuda/detail/util.h", - "rocm/include/thrust/system/cuda/detail/vector.inl", - "rocm/include/thrust/system/cuda/error.h", - "rocm/include/thrust/system/cuda/execution_policy.h", - "rocm/include/thrust/system/cuda/experimental/pinned_allocator.h", - "rocm/include/thrust/system/cuda/memory.h", - "rocm/include/thrust/system/cuda/vector.h", - "rocm/include/thrust/system/detail/adl/adjacent_difference.h", - "rocm/include/thrust/system/detail/adl/assign_value.h", - "rocm/include/thrust/system/detail/adl/binary_search.h", - "rocm/include/thrust/system/detail/adl/copy.h", - "rocm/include/thrust/system/detail/adl/copy_if.h", - "rocm/include/thrust/system/detail/adl/count.h", - "rocm/include/thrust/system/detail/adl/equal.h", - "rocm/include/thrust/system/detail/adl/extrema.h", - "rocm/include/thrust/system/detail/adl/fill.h", - "rocm/include/thrust/system/detail/adl/find.h", - "rocm/include/thrust/system/detail/adl/for_each.h", - "rocm/include/thrust/system/detail/adl/gather.h", - "rocm/include/thrust/system/detail/adl/generate.h", - "rocm/include/thrust/system/detail/adl/get_value.h", - "rocm/include/thrust/system/detail/adl/inner_product.h", - "rocm/include/thrust/system/detail/adl/iter_swap.h", - "rocm/include/thrust/system/detail/adl/logical.h", - "rocm/include/thrust/system/detail/adl/malloc_and_free.h", - "rocm/include/thrust/system/detail/adl/merge.h", - "rocm/include/thrust/system/detail/adl/mismatch.h", - 
"rocm/include/thrust/system/detail/adl/partition.h", - "rocm/include/thrust/system/detail/adl/reduce.h", - "rocm/include/thrust/system/detail/adl/reduce_by_key.h", - "rocm/include/thrust/system/detail/adl/remove.h", - "rocm/include/thrust/system/detail/adl/replace.h", - "rocm/include/thrust/system/detail/adl/reverse.h", - "rocm/include/thrust/system/detail/adl/scan.h", - "rocm/include/thrust/system/detail/adl/scan_by_key.h", - "rocm/include/thrust/system/detail/adl/scatter.h", - "rocm/include/thrust/system/detail/adl/sequence.h", - "rocm/include/thrust/system/detail/adl/set_operations.h", - "rocm/include/thrust/system/detail/adl/sort.h", - "rocm/include/thrust/system/detail/adl/swap_ranges.h", - "rocm/include/thrust/system/detail/adl/tabulate.h", - "rocm/include/thrust/system/detail/adl/temporary_buffer.h", - "rocm/include/thrust/system/detail/adl/transform.h", - "rocm/include/thrust/system/detail/adl/transform_reduce.h", - "rocm/include/thrust/system/detail/adl/transform_scan.h", - "rocm/include/thrust/system/detail/adl/uninitialized_copy.h", - "rocm/include/thrust/system/detail/adl/uninitialized_fill.h", - "rocm/include/thrust/system/detail/adl/unique.h", - "rocm/include/thrust/system/detail/adl/unique_by_key.h", - "rocm/include/thrust/system/detail/bad_alloc.h", - "rocm/include/thrust/system/detail/errno.h", - "rocm/include/thrust/system/detail/error_category.inl", - "rocm/include/thrust/system/detail/error_code.inl", - "rocm/include/thrust/system/detail/error_condition.inl", - "rocm/include/thrust/system/detail/generic/adjacent_difference.h", - "rocm/include/thrust/system/detail/generic/adjacent_difference.inl", - "rocm/include/thrust/system/detail/generic/advance.h", - "rocm/include/thrust/system/detail/generic/advance.inl", - "rocm/include/thrust/system/detail/generic/binary_search.h", - "rocm/include/thrust/system/detail/generic/binary_search.inl", - "rocm/include/thrust/system/detail/generic/copy.h", - "rocm/include/thrust/system/detail/generic/copy.inl", - "rocm/include/thrust/system/detail/generic/copy_if.h", - "rocm/include/thrust/system/detail/generic/copy_if.inl", - "rocm/include/thrust/system/detail/generic/count.h", - "rocm/include/thrust/system/detail/generic/count.inl", - "rocm/include/thrust/system/detail/generic/distance.h", - "rocm/include/thrust/system/detail/generic/distance.inl", - "rocm/include/thrust/system/detail/generic/equal.h", - "rocm/include/thrust/system/detail/generic/equal.inl", - "rocm/include/thrust/system/detail/generic/extrema.h", - "rocm/include/thrust/system/detail/generic/extrema.inl", - "rocm/include/thrust/system/detail/generic/fill.h", - "rocm/include/thrust/system/detail/generic/find.h", - "rocm/include/thrust/system/detail/generic/find.inl", - "rocm/include/thrust/system/detail/generic/for_each.h", - "rocm/include/thrust/system/detail/generic/gather.h", - "rocm/include/thrust/system/detail/generic/gather.inl", - "rocm/include/thrust/system/detail/generic/generate.h", - "rocm/include/thrust/system/detail/generic/generate.inl", - "rocm/include/thrust/system/detail/generic/inner_product.h", - "rocm/include/thrust/system/detail/generic/inner_product.inl", - "rocm/include/thrust/system/detail/generic/logical.h", - "rocm/include/thrust/system/detail/generic/memory.h", - "rocm/include/thrust/system/detail/generic/memory.inl", - "rocm/include/thrust/system/detail/generic/merge.h", - "rocm/include/thrust/system/detail/generic/merge.inl", - "rocm/include/thrust/system/detail/generic/mismatch.h", - "rocm/include/thrust/system/detail/generic/mismatch.inl", 
- "rocm/include/thrust/system/detail/generic/partition.h", - "rocm/include/thrust/system/detail/generic/partition.inl", - "rocm/include/thrust/system/detail/generic/reduce.h", - "rocm/include/thrust/system/detail/generic/reduce.inl", - "rocm/include/thrust/system/detail/generic/reduce_by_key.h", - "rocm/include/thrust/system/detail/generic/reduce_by_key.inl", - "rocm/include/thrust/system/detail/generic/remove.h", - "rocm/include/thrust/system/detail/generic/remove.inl", - "rocm/include/thrust/system/detail/generic/replace.h", - "rocm/include/thrust/system/detail/generic/replace.inl", - "rocm/include/thrust/system/detail/generic/reverse.h", - "rocm/include/thrust/system/detail/generic/reverse.inl", - "rocm/include/thrust/system/detail/generic/scalar/binary_search.h", - "rocm/include/thrust/system/detail/generic/scalar/binary_search.inl", - "rocm/include/thrust/system/detail/generic/scan.h", - "rocm/include/thrust/system/detail/generic/scan.inl", - "rocm/include/thrust/system/detail/generic/scan_by_key.h", - "rocm/include/thrust/system/detail/generic/scan_by_key.inl", - "rocm/include/thrust/system/detail/generic/scatter.h", - "rocm/include/thrust/system/detail/generic/scatter.inl", - "rocm/include/thrust/system/detail/generic/select_system.h", - "rocm/include/thrust/system/detail/generic/sequence.h", - "rocm/include/thrust/system/detail/generic/sequence.inl", - "rocm/include/thrust/system/detail/generic/set_operations.h", - "rocm/include/thrust/system/detail/generic/set_operations.inl", - "rocm/include/thrust/system/detail/generic/sort.h", - "rocm/include/thrust/system/detail/generic/sort.inl", - "rocm/include/thrust/system/detail/generic/swap_ranges.h", - "rocm/include/thrust/system/detail/generic/swap_ranges.inl", - "rocm/include/thrust/system/detail/generic/tabulate.h", - "rocm/include/thrust/system/detail/generic/tabulate.inl", - "rocm/include/thrust/system/detail/generic/tag.h", - "rocm/include/thrust/system/detail/generic/temporary_buffer.h", - "rocm/include/thrust/system/detail/generic/temporary_buffer.inl", - "rocm/include/thrust/system/detail/generic/transform.h", - "rocm/include/thrust/system/detail/generic/transform.inl", - "rocm/include/thrust/system/detail/generic/transform_reduce.h", - "rocm/include/thrust/system/detail/generic/transform_reduce.inl", - "rocm/include/thrust/system/detail/generic/transform_scan.h", - "rocm/include/thrust/system/detail/generic/transform_scan.inl", - "rocm/include/thrust/system/detail/generic/type_traits.h", - "rocm/include/thrust/system/detail/generic/uninitialized_copy.h", - "rocm/include/thrust/system/detail/generic/uninitialized_copy.inl", - "rocm/include/thrust/system/detail/generic/uninitialized_fill.h", - "rocm/include/thrust/system/detail/generic/uninitialized_fill.inl", - "rocm/include/thrust/system/detail/generic/unique.h", - "rocm/include/thrust/system/detail/generic/unique.inl", - "rocm/include/thrust/system/detail/generic/unique_by_key.h", - "rocm/include/thrust/system/detail/generic/unique_by_key.inl", - "rocm/include/thrust/system/detail/internal/decompose.h", - "rocm/include/thrust/system/detail/sequential/adjacent_difference.h", - "rocm/include/thrust/system/detail/sequential/assign_value.h", - "rocm/include/thrust/system/detail/sequential/binary_search.h", - "rocm/include/thrust/system/detail/sequential/copy.h", - "rocm/include/thrust/system/detail/sequential/copy.inl", - "rocm/include/thrust/system/detail/sequential/copy_backward.h", - "rocm/include/thrust/system/detail/sequential/copy_if.h", - 
"rocm/include/thrust/system/detail/sequential/count.h", - "rocm/include/thrust/system/detail/sequential/equal.h", - "rocm/include/thrust/system/detail/sequential/execution_policy.h", - "rocm/include/thrust/system/detail/sequential/extrema.h", - "rocm/include/thrust/system/detail/sequential/fill.h", - "rocm/include/thrust/system/detail/sequential/find.h", - "rocm/include/thrust/system/detail/sequential/for_each.h", - "rocm/include/thrust/system/detail/sequential/gather.h", - "rocm/include/thrust/system/detail/sequential/general_copy.h", - "rocm/include/thrust/system/detail/sequential/generate.h", - "rocm/include/thrust/system/detail/sequential/get_value.h", - "rocm/include/thrust/system/detail/sequential/inner_product.h", - "rocm/include/thrust/system/detail/sequential/insertion_sort.h", - "rocm/include/thrust/system/detail/sequential/iter_swap.h", - "rocm/include/thrust/system/detail/sequential/logical.h", - "rocm/include/thrust/system/detail/sequential/malloc_and_free.h", - "rocm/include/thrust/system/detail/sequential/merge.h", - "rocm/include/thrust/system/detail/sequential/merge.inl", - "rocm/include/thrust/system/detail/sequential/mismatch.h", - "rocm/include/thrust/system/detail/sequential/partition.h", - "rocm/include/thrust/system/detail/sequential/reduce.h", - "rocm/include/thrust/system/detail/sequential/reduce_by_key.h", - "rocm/include/thrust/system/detail/sequential/remove.h", - "rocm/include/thrust/system/detail/sequential/replace.h", - "rocm/include/thrust/system/detail/sequential/reverse.h", - "rocm/include/thrust/system/detail/sequential/scan.h", - "rocm/include/thrust/system/detail/sequential/scan_by_key.h", - "rocm/include/thrust/system/detail/sequential/scatter.h", - "rocm/include/thrust/system/detail/sequential/sequence.h", - "rocm/include/thrust/system/detail/sequential/set_operations.h", - "rocm/include/thrust/system/detail/sequential/sort.h", - "rocm/include/thrust/system/detail/sequential/sort.inl", - "rocm/include/thrust/system/detail/sequential/stable_merge_sort.h", - "rocm/include/thrust/system/detail/sequential/stable_merge_sort.inl", - "rocm/include/thrust/system/detail/sequential/stable_primitive_sort.h", - "rocm/include/thrust/system/detail/sequential/stable_primitive_sort.inl", - "rocm/include/thrust/system/detail/sequential/stable_radix_sort.h", - "rocm/include/thrust/system/detail/sequential/stable_radix_sort.inl", - "rocm/include/thrust/system/detail/sequential/swap_ranges.h", - "rocm/include/thrust/system/detail/sequential/tabulate.h", - "rocm/include/thrust/system/detail/sequential/temporary_buffer.h", - "rocm/include/thrust/system/detail/sequential/transform.h", - "rocm/include/thrust/system/detail/sequential/transform_reduce.h", - "rocm/include/thrust/system/detail/sequential/transform_scan.h", - "rocm/include/thrust/system/detail/sequential/trivial_copy.h", - "rocm/include/thrust/system/detail/sequential/uninitialized_copy.h", - "rocm/include/thrust/system/detail/sequential/uninitialized_fill.h", - "rocm/include/thrust/system/detail/sequential/unique.h", - "rocm/include/thrust/system/detail/sequential/unique_by_key.h", - "rocm/include/thrust/system/detail/system_error.inl", - "rocm/include/thrust/system/error_code.h", - "rocm/include/thrust/system/hip/config.h", - "rocm/include/thrust/system/hip/detail/adjacent_difference.h", - "rocm/include/thrust/system/hip/detail/assign_value.h", - "rocm/include/thrust/system/hip/detail/binary_search.h", - "rocm/include/thrust/system/hip/detail/copy.h", - "rocm/include/thrust/system/hip/detail/copy_if.h", - 
"rocm/include/thrust/system/hip/detail/count.h", - "rocm/include/thrust/system/hip/detail/cross_system.h", - "rocm/include/thrust/system/hip/detail/equal.h", - "rocm/include/thrust/system/hip/detail/error.inl", - "rocm/include/thrust/system/hip/detail/execution_policy.h", - "rocm/include/thrust/system/hip/detail/extrema.h", - "rocm/include/thrust/system/hip/detail/fill.h", - "rocm/include/thrust/system/hip/detail/find.h", - "rocm/include/thrust/system/hip/detail/for_each.h", - "rocm/include/thrust/system/hip/detail/gather.h", - "rocm/include/thrust/system/hip/detail/generate.h", - "rocm/include/thrust/system/hip/detail/get_value.h", - "rocm/include/thrust/system/hip/detail/guarded_driver_types.h", - "rocm/include/thrust/system/hip/detail/guarded_hip_runtime_api.h", - "rocm/include/thrust/system/hip/detail/inner_product.h", - "rocm/include/thrust/system/hip/detail/internal/copy_cross_system.h", - "rocm/include/thrust/system/hip/detail/internal/copy_device_to_device.h", - "rocm/include/thrust/system/hip/detail/iter_swap.h", - "rocm/include/thrust/system/hip/detail/logical.h", - "rocm/include/thrust/system/hip/detail/malloc_and_free.h", - "rocm/include/thrust/system/hip/detail/memory.inl", - "rocm/include/thrust/system/hip/detail/memory_buffer.h", - "rocm/include/thrust/system/hip/detail/merge.h", - "rocm/include/thrust/system/hip/detail/mismatch.h", - "rocm/include/thrust/system/hip/detail/par.h", - "rocm/include/thrust/system/hip/detail/par_to_seq.h", - "rocm/include/thrust/system/hip/detail/parallel_for.h", - "rocm/include/thrust/system/hip/detail/partition.h", - "rocm/include/thrust/system/hip/detail/reduce.h", - "rocm/include/thrust/system/hip/detail/reduce_by_key.h", - "rocm/include/thrust/system/hip/detail/remove.h", - "rocm/include/thrust/system/hip/detail/replace.h", - "rocm/include/thrust/system/hip/detail/reverse.h", - "rocm/include/thrust/system/hip/detail/scan.h", - "rocm/include/thrust/system/hip/detail/scan_by_key.h", - "rocm/include/thrust/system/hip/detail/scatter.h", - "rocm/include/thrust/system/hip/detail/sequence.h", - "rocm/include/thrust/system/hip/detail/set_operations.h", - "rocm/include/thrust/system/hip/detail/sort.h", - "rocm/include/thrust/system/hip/detail/swap_ranges.h", - "rocm/include/thrust/system/hip/detail/tabulate.h", - "rocm/include/thrust/system/hip/detail/temporary_buffer.h", - "rocm/include/thrust/system/hip/detail/terminate.h", - "rocm/include/thrust/system/hip/detail/transform.h", - "rocm/include/thrust/system/hip/detail/transform_reduce.h", - "rocm/include/thrust/system/hip/detail/transform_scan.h", - "rocm/include/thrust/system/hip/detail/uninitialized_copy.h", - "rocm/include/thrust/system/hip/detail/uninitialized_fill.h", - "rocm/include/thrust/system/hip/detail/unique.h", - "rocm/include/thrust/system/hip/detail/unique_by_key.h", - "rocm/include/thrust/system/hip/detail/util.h", - "rocm/include/thrust/system/hip/detail/vector.inl", - "rocm/include/thrust/system/hip/error.h", - "rocm/include/thrust/system/hip/execution_policy.h", - "rocm/include/thrust/system/hip/memory.h", - "rocm/include/thrust/system/hip/pointer.h", - "rocm/include/thrust/system/hip/vector.h", - "rocm/include/thrust/system/omp/detail/adjacent_difference.h", - "rocm/include/thrust/system/omp/detail/assign_value.h", - "rocm/include/thrust/system/omp/detail/binary_search.h", - "rocm/include/thrust/system/omp/detail/copy.h", - "rocm/include/thrust/system/omp/detail/copy.inl", - "rocm/include/thrust/system/omp/detail/copy_if.h", - 
"rocm/include/thrust/system/omp/detail/copy_if.inl", - "rocm/include/thrust/system/omp/detail/count.h", - "rocm/include/thrust/system/omp/detail/default_decomposition.h", - "rocm/include/thrust/system/omp/detail/default_decomposition.inl", - "rocm/include/thrust/system/omp/detail/equal.h", - "rocm/include/thrust/system/omp/detail/execution_policy.h", - "rocm/include/thrust/system/omp/detail/extrema.h", - "rocm/include/thrust/system/omp/detail/fill.h", - "rocm/include/thrust/system/omp/detail/find.h", - "rocm/include/thrust/system/omp/detail/for_each.h", - "rocm/include/thrust/system/omp/detail/for_each.inl", - "rocm/include/thrust/system/omp/detail/gather.h", - "rocm/include/thrust/system/omp/detail/generate.h", - "rocm/include/thrust/system/omp/detail/get_value.h", - "rocm/include/thrust/system/omp/detail/inner_product.h", - "rocm/include/thrust/system/omp/detail/iter_swap.h", - "rocm/include/thrust/system/omp/detail/logical.h", - "rocm/include/thrust/system/omp/detail/malloc_and_free.h", - "rocm/include/thrust/system/omp/detail/memory.inl", - "rocm/include/thrust/system/omp/detail/merge.h", - "rocm/include/thrust/system/omp/detail/mismatch.h", - "rocm/include/thrust/system/omp/detail/par.h", - "rocm/include/thrust/system/omp/detail/partition.h", - "rocm/include/thrust/system/omp/detail/partition.inl", - "rocm/include/thrust/system/omp/detail/reduce.h", - "rocm/include/thrust/system/omp/detail/reduce.inl", - "rocm/include/thrust/system/omp/detail/reduce_by_key.h", - "rocm/include/thrust/system/omp/detail/reduce_by_key.inl", - "rocm/include/thrust/system/omp/detail/reduce_intervals.h", - "rocm/include/thrust/system/omp/detail/reduce_intervals.inl", - "rocm/include/thrust/system/omp/detail/remove.h", - "rocm/include/thrust/system/omp/detail/remove.inl", - "rocm/include/thrust/system/omp/detail/replace.h", - "rocm/include/thrust/system/omp/detail/reverse.h", - "rocm/include/thrust/system/omp/detail/scan.h", - "rocm/include/thrust/system/omp/detail/scan_by_key.h", - "rocm/include/thrust/system/omp/detail/scatter.h", - "rocm/include/thrust/system/omp/detail/sequence.h", - "rocm/include/thrust/system/omp/detail/set_operations.h", - "rocm/include/thrust/system/omp/detail/sort.h", - "rocm/include/thrust/system/omp/detail/sort.inl", - "rocm/include/thrust/system/omp/detail/swap_ranges.h", - "rocm/include/thrust/system/omp/detail/tabulate.h", - "rocm/include/thrust/system/omp/detail/temporary_buffer.h", - "rocm/include/thrust/system/omp/detail/transform.h", - "rocm/include/thrust/system/omp/detail/transform_reduce.h", - "rocm/include/thrust/system/omp/detail/transform_scan.h", - "rocm/include/thrust/system/omp/detail/uninitialized_copy.h", - "rocm/include/thrust/system/omp/detail/uninitialized_fill.h", - "rocm/include/thrust/system/omp/detail/unique.h", - "rocm/include/thrust/system/omp/detail/unique.inl", - "rocm/include/thrust/system/omp/detail/unique_by_key.h", - "rocm/include/thrust/system/omp/detail/unique_by_key.inl", - "rocm/include/thrust/system/omp/detail/vector.inl", - "rocm/include/thrust/system/omp/execution_policy.h", - "rocm/include/thrust/system/omp/memory.h", - "rocm/include/thrust/system/omp/vector.h", - "rocm/include/thrust/system/system_error.h", - "rocm/include/thrust/system/tbb/detail/adjacent_difference.h", - "rocm/include/thrust/system/tbb/detail/assign_value.h", - "rocm/include/thrust/system/tbb/detail/binary_search.h", - "rocm/include/thrust/system/tbb/detail/copy.h", - "rocm/include/thrust/system/tbb/detail/copy.inl", - "rocm/include/thrust/system/tbb/detail/copy_if.h", - 
"rocm/include/thrust/system/tbb/detail/copy_if.inl", - "rocm/include/thrust/system/tbb/detail/count.h", - "rocm/include/thrust/system/tbb/detail/equal.h", - "rocm/include/thrust/system/tbb/detail/execution_policy.h", - "rocm/include/thrust/system/tbb/detail/extrema.h", - "rocm/include/thrust/system/tbb/detail/fill.h", - "rocm/include/thrust/system/tbb/detail/find.h", - "rocm/include/thrust/system/tbb/detail/for_each.h", - "rocm/include/thrust/system/tbb/detail/for_each.inl", - "rocm/include/thrust/system/tbb/detail/gather.h", - "rocm/include/thrust/system/tbb/detail/generate.h", - "rocm/include/thrust/system/tbb/detail/get_value.h", - "rocm/include/thrust/system/tbb/detail/inner_product.h", - "rocm/include/thrust/system/tbb/detail/iter_swap.h", - "rocm/include/thrust/system/tbb/detail/logical.h", - "rocm/include/thrust/system/tbb/detail/malloc_and_free.h", - "rocm/include/thrust/system/tbb/detail/memory.inl", - "rocm/include/thrust/system/tbb/detail/merge.h", - "rocm/include/thrust/system/tbb/detail/merge.inl", - "rocm/include/thrust/system/tbb/detail/mismatch.h", - "rocm/include/thrust/system/tbb/detail/par.h", - "rocm/include/thrust/system/tbb/detail/partition.h", - "rocm/include/thrust/system/tbb/detail/partition.inl", - "rocm/include/thrust/system/tbb/detail/reduce.h", - "rocm/include/thrust/system/tbb/detail/reduce.inl", - "rocm/include/thrust/system/tbb/detail/reduce_by_key.h", - "rocm/include/thrust/system/tbb/detail/reduce_by_key.inl", - "rocm/include/thrust/system/tbb/detail/reduce_intervals.h", - "rocm/include/thrust/system/tbb/detail/remove.h", - "rocm/include/thrust/system/tbb/detail/remove.inl", - "rocm/include/thrust/system/tbb/detail/replace.h", - "rocm/include/thrust/system/tbb/detail/reverse.h", - "rocm/include/thrust/system/tbb/detail/scan.h", - "rocm/include/thrust/system/tbb/detail/scan.inl", - "rocm/include/thrust/system/tbb/detail/scan_by_key.h", - "rocm/include/thrust/system/tbb/detail/scatter.h", - "rocm/include/thrust/system/tbb/detail/sequence.h", - "rocm/include/thrust/system/tbb/detail/set_operations.h", - "rocm/include/thrust/system/tbb/detail/sort.h", - "rocm/include/thrust/system/tbb/detail/sort.inl", - "rocm/include/thrust/system/tbb/detail/swap_ranges.h", - "rocm/include/thrust/system/tbb/detail/tabulate.h", - "rocm/include/thrust/system/tbb/detail/temporary_buffer.h", - "rocm/include/thrust/system/tbb/detail/transform.h", - "rocm/include/thrust/system/tbb/detail/transform_reduce.h", - "rocm/include/thrust/system/tbb/detail/transform_scan.h", - "rocm/include/thrust/system/tbb/detail/uninitialized_copy.h", - "rocm/include/thrust/system/tbb/detail/uninitialized_fill.h", - "rocm/include/thrust/system/tbb/detail/unique.h", - "rocm/include/thrust/system/tbb/detail/unique.inl", - "rocm/include/thrust/system/tbb/detail/unique_by_key.h", - "rocm/include/thrust/system/tbb/detail/unique_by_key.inl", - "rocm/include/thrust/system/tbb/detail/vector.inl", - "rocm/include/thrust/system/tbb/execution_policy.h", - "rocm/include/thrust/system/tbb/memory.h", - "rocm/include/thrust/system/tbb/vector.h", - "rocm/include/thrust/system_error.h", - "rocm/include/thrust/tabulate.h", - "rocm/include/thrust/transform.h", - "rocm/include/thrust/transform_reduce.h", - "rocm/include/thrust/transform_scan.h", - "rocm/include/thrust/tuple.h", - "rocm/include/thrust/uninitialized_copy.h", - "rocm/include/thrust/uninitialized_fill.h", - "rocm/include/thrust/unique.h", - "rocm/include/thrust/version.h", - "rocm/include/utils/allocate_free.hpp", - "rocm/include/utils/def.hpp", - 
"rocm/include/utils/time_functions.hpp", - "rocm/include/utils/types.hpp", - "rocm/include/version.hpp", - ], - cmd = """cp -rLf "/opt/rocm/include/." "$(@D)/rocm/include/" """, -) - -genrule( - name = "rocfft-include", - outs = [ - "rocm/include/rocfft/hipfft.h", - "rocm/include/rocfft/rocfft-export.h", - "rocm/include/rocfft/rocfft-version.h", - "rocm/include/rocfft/rocfft.h", - ], - cmd = """cp -rLf "/opt/rocm/rocfft/include/." "$(@D)/rocm/include/rocfft/" """, -) - -genrule( - name = "rocblas-include", - outs = [ - "rocm/include/rocblas/rocblas-auxiliary.h", - "rocm/include/rocblas/rocblas-complex-types.h", - "rocm/include/rocblas/rocblas-export.h", - "rocm/include/rocblas/rocblas-functions.h", - "rocm/include/rocblas/rocblas-types.h", - "rocm/include/rocblas/rocblas-version.h", - "rocm/include/rocblas/rocblas.h", - "rocm/include/rocblas/rocblas_bfloat16.h", - ], - cmd = """cp -rLf "/opt/rocm/rocblas/include/." "$(@D)/rocm/include/rocblas/" """, -) - -genrule( - name = "miopen-include", - outs = [ - "rocm/include/miopen/miopen/config.h", - "rocm/include/miopen/miopen/export.h", - "rocm/include/miopen/miopen/miopen.h", - "rocm/include/miopen/miopen/version.h", - "rocm/include/miopen/miopen_kernel_includes.h", - "rocm/include/miopen/miopen_kernels.h", - ], - cmd = """cp -rLf "/opt/rocm/miopen/include/." "$(@D)/rocm/include/miopen/" """, -) - -genrule( - name = "rccl-include", - outs = [ - "rocm/include/rccl/rccl.h", - ], - cmd = """cp -rLf "/opt/rocm/rccl/include/." "$(@D)/" """, -) - -genrule( - name = "hipsparse-include", - outs = [ - "rocm/include/hipsparse/hipsparse-export.h", - "rocm/include/hipsparse/hipsparse-version.h", - "rocm/include/hipsparse/hipsparse.h", - ], - cmd = """cp -rLf "/opt/rocm/hipsparse/include/." "$(@D)/rocm/include/hipsparse/" """, -) - -genrule( - name = "rocm-lib", - outs = [ - "rocm/lib/libhip_hcc.so", - "rocm/lib/librocblas.so", - "rocm/lib/librocfft.so", - "rocm/lib/libhiprand.so", - "rocm/lib/libMIOpen.so", - "rocm/lib/librccl.so", - "rocm/lib/libhipsparse.so", - ], - cmd = """cp -f "/opt/rocm/hip/lib/libhip_hcc.so" "$(location rocm/lib/libhip_hcc.so)" && \ -cp -f "/opt/rocm/rocblas/lib/librocblas.so.0.1" "$(location rocm/lib/librocblas.so)" && \ -cp -f "/opt/rocm/rocfft/lib/librocfft.so.0.1" "$(location rocm/lib/librocfft.so)" && \ -cp -f "/opt/rocm/hiprand/lib/libhiprand.so.1.1" "$(location rocm/lib/libhiprand.so)" && \ -cp -f "/opt/rocm/miopen/lib/libMIOpen.so.1" "$(location rocm/lib/libMIOpen.so)" && \ -cp -f "/opt/rocm/rccl/lib/librccl.so" "$(location rocm/lib/librccl.so)" && \ -cp -f "/opt/rocm/hipsparse/lib/libhipsparse.so.0.1" "$(location rocm/lib/libhipsparse.so)" """, -) diff --git a/third_party/toolchains/preconfig/ubuntu16.04/rocm/rocm/build_defs.bzl b/third_party/toolchains/preconfig/ubuntu16.04/rocm/rocm/build_defs.bzl deleted file mode 100755 index 2d43007ef84..00000000000 --- a/third_party/toolchains/preconfig/ubuntu16.04/rocm/rocm/build_defs.bzl +++ /dev/null @@ -1,44 +0,0 @@ -# Macros for building ROCm code. -def if_rocm(if_true, if_false = []): - """Shorthand for select()'ing on whether we're building with ROCm. - - Returns a select statement which evaluates to if_true if we're building - with ROCm enabled. Otherwise, the select statement evaluates to if_false. 
- - """ - return select({ - "@local_config_rocm//rocm:using_hipcc": if_true, - "//conditions:default": if_false, - }) - -def rocm_default_copts(): - """Default options for all ROCm compilations.""" - return if_rocm(["-x", "rocm"] + []) - -def rocm_copts(opts = []): - """Gets the appropriate set of copts for (maybe) ROCm compilation. - - If we're doing ROCm compilation, returns copts for our particular ROCm - compiler. If we're not doing ROCm compilation, returns an empty list. - - """ - return rocm_default_copts() + select({ - "//conditions:default": [], - "@local_config_rocm//rocm:using_hipcc": ([ - "", - ]), - }) + if_rocm_is_configured(opts) - -def rocm_is_configured(): - """Returns true if ROCm was enabled during the configure process.""" - return True - -def if_rocm_is_configured(x): - """Tests if the ROCm was enabled during the configure process. - - Unlike if_rocm(), this does not require that we are building with - --config=rocm. Used to allow non-ROCm code to depend on ROCm libraries. - """ - if rocm_is_configured(): - return x - return [] diff --git a/third_party/toolchains/preconfig/ubuntu16.04/rocm/rocm/rocm/rocm_config.h b/third_party/toolchains/preconfig/ubuntu16.04/rocm/rocm/rocm/rocm_config.h deleted file mode 100755 index c5f25a845ca..00000000000 --- a/third_party/toolchains/preconfig/ubuntu16.04/rocm/rocm/rocm/rocm_config.h +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef ROCM_ROCM_CONFIG_H_ -#define ROCM_ROCM_CONFIG_H_ - -#define TF_ROCM_TOOLKIT_PATH "/opt/rocm" - -#endif // ROCM_ROCM_CONFIG_H_ diff --git a/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/BUILD b/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/BUILD deleted file mode 100755 index 88980d1014a..00000000000 --- a/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/BUILD +++ /dev/null @@ -1,63 +0,0 @@ -# NVIDIA TensorRT -# A high-performance deep learning inference optimizer and runtime. 
- -licenses(["notice"]) - -load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts") -load("@bazel_skylib//:bzl_library.bzl", "bzl_library") - -package(default_visibility = ["//visibility:public"]) - -exports_files(["LICENSE"]) - -cc_library( - name = "tensorrt_headers", - hdrs = [ - "tensorrt/include/tensorrt_config.h", - ":tensorrt_include", - ], - include_prefix = "third_party/tensorrt", - strip_include_prefix = "tensorrt/include", -) - -cc_library( - name = "tensorrt", - srcs = [":tensorrt_lib"], - copts = cuda_default_copts(), - data = [":tensorrt_lib"], - linkstatic = 1, - deps = [ - ":tensorrt_headers", - "@local_config_cuda//cuda", - ], -) - -bzl_library( - name = "build_defs_bzl", - srcs = ["build_defs.bzl"], - deps = [ - "@bazel_skylib//lib:selects", - ], -) - -genrule( - name = "tensorrt_lib", - outs = [ - "tensorrt/lib/libnvinfer.so.5", - "tensorrt/lib/libnvinfer_plugin.so.5", - ], - cmd = """cp -f "/usr/lib/x86_64-linux-gnu/libnvinfer.so.5" "$(location tensorrt/lib/libnvinfer.so.5)" && \ -cp -f "/usr/lib/x86_64-linux-gnu/libnvinfer_plugin.so.5" "$(location tensorrt/lib/libnvinfer_plugin.so.5)" """, -) - -genrule( - name = "tensorrt_include", - outs = [ - "tensorrt/include/NvInfer.h", - "tensorrt/include/NvUtils.h", - "tensorrt/include/NvInferPlugin.h", - ], - cmd = """cp -f "/usr/include/x86_64-linux-gnu/NvInfer.h" "$(location tensorrt/include/NvInfer.h)" && \ -cp -f "/usr/include/x86_64-linux-gnu/NvUtils.h" "$(location tensorrt/include/NvUtils.h)" && \ -cp -f "/usr/include/x86_64-linux-gnu/NvInferPlugin.h" "$(location tensorrt/include/NvInferPlugin.h)" """, -) diff --git a/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/LICENSE b/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/LICENSE deleted file mode 100755 index 146d9b765c5..00000000000 --- a/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/LICENSE +++ /dev/null @@ -1,203 +0,0 @@ -Copyright 2018 The TensorFlow Authors. All rights reserved. - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. 
- - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright 2018, The TensorFlow Authors. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/WORKSPACE b/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/WORKSPACE deleted file mode 100644 index ce47f14b91b..00000000000 --- a/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/WORKSPACE +++ /dev/null @@ -1,2 +0,0 @@ -# DO NOT EDIT: automatically generated WORKSPACE file for tensorrt_configure rule -workspace(name = "local_config_tensorrt") diff --git a/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/build_defs.bzl b/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/build_defs.bzl deleted file mode 100755 index 527be938341..00000000000 --- a/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/build_defs.bzl +++ /dev/null @@ -1,5 +0,0 @@ -# Build configurations for TensorRT. 
- -def if_tensorrt(if_true, if_false = []): - """Tests whether TensorRT was enabled during the configure process.""" - return if_true diff --git a/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/tensorrt/include/tensorrt_config.h b/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/tensorrt/include/tensorrt_config.h deleted file mode 100755 index 02a166f4cd1..00000000000 --- a/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/tensorrt/include/tensorrt_config.h +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORRT_TENSORRT_INCLUDE_CONFIG_H_ -#define TENSORRT_TENSORRT_INCLUDE_CONFIG_H_ - -#define TF_TENSORRT_VERSION "5" - -#endif // TENSORRT_TENSORRT_INCLUDE_CONFIG_H_ From ec445c6d8b4685a104df423235f074f4c8def13f Mon Sep 17 00:00:00 2001 From: Terry Heo Date: Thu, 20 Feb 2020 02:04:59 -0800 Subject: [PATCH 343/442] Fix iOS build failure Removed absl::StrCat() usage. PiperOrigin-RevId: 296159687 Change-Id: I1f825cbda293c7e218921563c3602777b9e29810 --- tensorflow/lite/delegates/gpu/metal/kernels/elementwise.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/elementwise.cc b/tensorflow/lite/delegates/gpu/metal/kernels/elementwise.cc index 7a93fc6d670..18430f8e71f 100644 --- a/tensorflow/lite/delegates/gpu/metal/kernels/elementwise.cc +++ b/tensorflow/lite/delegates/gpu/metal/kernels/elementwise.cc @@ -57,8 +57,7 @@ std::string GetElementwiseWithTwoInputsCode(int src_count, if (scalar == nullptr) { code += " FLT4 src_1 = src_buffer1[linear_index];"; } else { - code += - absl::StrCat(" FLT4 src_1 = FLT4(", std::to_string(*scalar), ");"); + code += " FLT4 src_1 = FLT4(" + std::to_string(*scalar) + ");"; } switch (op_type) { case OperationType::DIV: { From 1f5efda81c2c06359b7fb407c8dbd7bccc349546 Mon Sep 17 00:00:00 2001 From: nikochiko Date: Thu, 20 Feb 2020 17:20:20 +0530 Subject: [PATCH 344/442] Single quotes -> double quotes for LazyLoader --- tensorflow/python/keras/saving/save.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/keras/saving/save.py b/tensorflow/python/keras/saving/save.py index cb94f336408..9a970480633 100644 --- a/tensorflow/python/keras/saving/save.py +++ b/tensorflow/python/keras/saving/save.py @@ -29,9 +29,11 @@ from tensorflow.python.saved_model import loader_impl from tensorflow.python.util.lazy_loader import LazyLoader from tensorflow.python.util.tf_export import keras_export +# pylint:disable=g-inconsistent-quotes network = LazyLoader( - 'network', globals(), - 'tensorflow.python.keras.engine.network') + "network", globals(), + "tensorflow.python.keras.engine.network") +# pylint:enable=g-inconsistent-quotes # pylint: disable=g-import-not-at-top if sys.version_info >= (3, 4): From 946709e7cf982c1683e44456e87dea047a665aa3 Mon Sep 17 00:00:00 2001 From: nikochiko Date: Thu, 20 Feb 
2020 17:21:40 +0530 Subject: [PATCH 345/442] Make pylint comments consistent --- tensorflow/python/keras/saving/save.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/keras/saving/save.py b/tensorflow/python/keras/saving/save.py index 9a970480633..71144a79e8c 100644 --- a/tensorflow/python/keras/saving/save.py +++ b/tensorflow/python/keras/saving/save.py @@ -29,11 +29,11 @@ from tensorflow.python.saved_model import loader_impl from tensorflow.python.util.lazy_loader import LazyLoader from tensorflow.python.util.tf_export import keras_export -# pylint:disable=g-inconsistent-quotes +# pylint: disable=g-inconsistent-quotes network = LazyLoader( "network", globals(), "tensorflow.python.keras.engine.network") -# pylint:enable=g-inconsistent-quotes +# pylint: enable=g-inconsistent-quotes # pylint: disable=g-import-not-at-top if sys.version_info >= (3, 4): From 7235515e2fca9f7c58ed4f5fdefc0d9fc9bd8c0c Mon Sep 17 00:00:00 2001 From: Stephan Herhut Date: Thu, 20 Feb 2020 06:53:40 -0800 Subject: [PATCH 346/442] Implement lowering of dynamic_broadcast_in_dim from HLO to LHLO. PiperOrigin-RevId: 296198208 Change-Id: Iabe355fd8f87545dac6c9b537682668a2c653afb --- .../mlir/xla/tests/hlo-legalize-to-lhlo.mlir | 24 ++++++ .../xla/transforms/hlo_legalize_to_lhlo.cc | 76 ++++++++++++++++++- 2 files changed, 96 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir index 7ed4e97053d..4b2d76e586a 100644 --- a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir +++ b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir @@ -133,6 +133,30 @@ func @broadcast(%operand: memref<5xf32>, %result: memref<10x5xf32>) { return } +// CHECK-LABEL: func @dyn_broadcast +func @dyn_broadcast(%operand: memref) { + %tensor_operand = tensor_load %operand : memref + %shape = "compute.shape"() : () -> tensor<3xi64> + %tensor_result = "xla_hlo.dynamic_broadcast_in_dim"(%tensor_operand, %shape) + {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} + : (tensor, tensor<3xi64>) -> tensor + // CHECK: %[[SHAPE:.*]] = "compute.shape"() + // CHECK: %[[C0:.*]] = constant 0 : index + // CHECK: %[[EL0:.*]] = extract_element %[[SHAPE]][%[[C0]]] : tensor<3xi64> + // CHECK: %[[IC0:.*]] = index_cast %[[EL0]] : i64 to index + // CHECK: %[[C1:.*]] = constant 1 : index + // CHECK: %[[EL1:.*]] = extract_element %[[SHAPE]][%[[C1]]] : tensor<3xi64> + // CHECK: %[[IC1:.*]] = index_cast %[[EL1]] : i64 to index + // CHECK: %[[C2:.*]] = constant 2 : index + // CHECK: %[[EL2:.*]] = extract_element %[[SHAPE]][%[[C2]]] : tensor<3xi64> + // CHECK: %[[IC2:.*]] = index_cast %[[EL2]] : i64 to index + // CHECK: %[[RESULT:.*]] = alloc(%[[IC0]], %[[IC1]], %[[IC2]]) + // CHECK-NEXT: "xla_lhlo.broadcast_in_dim"(%{{.*}}, %[[RESULT]]) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} + // Do not store the value back to avoid the tensor-store being rewritten to + // a copy into the pre-allocated argument. 
+ return +} + // CHECK-LABEL: func @iota func @iota(%result: memref<10xi32>) { %tensor_result = "xla_hlo.iota"() diff --git a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc index 57610758bae..77c361a8ab5 100644 --- a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc +++ b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc @@ -49,6 +49,48 @@ Operation* FindInsertionPointForCopy(Value value) { return nullptr; } +Value InsertDynamicAllocAndDealloc(Location loc, Value result, + Value shape_operand, + ConversionPatternRewriter* rewriter) { + auto result_type = result.getType().dyn_cast(); + if (!result_type) { + result.getDefiningOp()->emitOpError() + << "tensor to buffer conversion expects ranked results"; + } + auto memref_type = + MemRefType::get(result_type.getShape(), result_type.getElementType()); + + Operation* op = result.getDefiningOp(); + auto block = op->getBlock(); + + // Extract the required element out of the vector. + SmallVector dynamic_operands; + for (auto shape_element : llvm::enumerate(result_type.getShape())) { + if (shape_element.value() != ShapedType::kDynamicSize) continue; + Value index = rewriter->create( + loc, rewriter->getIntegerAttr(rewriter->getIndexType(), + shape_element.index())); + Value alloc_operand = rewriter->create(loc, shape_operand, + ValueRange{index}); + if (!alloc_operand.getType().isIndex()) { + alloc_operand = rewriter->create(loc, alloc_operand, + rewriter->getIndexType()); + } + dynamic_operands.push_back(alloc_operand); + } + + // Insert in front of op to ensure sizes are available. + OpBuilder allocBuilder(op); + auto alloc = allocBuilder.create(loc, memref_type, dynamic_operands); + + alloc.setAttr(kTempBufferAttr, rewriter->getBoolAttr(true)); + + allocBuilder.setInsertionPoint(block, std::prev(block->end())); + allocBuilder.create(loc, alloc); + + return alloc; +} + Value InsertAllocAndDealloc(Location loc, Value result, ConversionPatternRewriter* rewriter) { auto result_type = result.getType().dyn_cast(); @@ -96,6 +138,30 @@ class HloToLhloOpConverter : public ConversionPattern { } }; +struct HloToLHloDynamicBroadcastInDimOpConverter + : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + PatternMatchResult matchAndRewrite( + xla_hlo::DynamicBroadcastInDimOp op, ArrayRef operands, + ConversionPatternRewriter& rewriter) const final { + auto loc = op.getLoc(); + auto broadcast_dimensions = op.broadcast_dimensions(); + if (!broadcast_dimensions.hasValue()) { + return matchFailure(); + } + Value resultBuffer = InsertDynamicAllocAndDealloc( + loc, op.getResult(), op.output_dimensions(), &rewriter); + rewriter.create( + loc, operands[0], resultBuffer, broadcast_dimensions.getValue()); + + rewriter.replaceOp(op, {resultBuffer}); + + return matchSuccess(); + } +}; + struct HloToLHloReduceOpConverter : public OpConversionPattern { public: @@ -264,7 +330,8 @@ struct HloLegalizeToLhlo : public ModulePass { auto module = getModule(); populateHLOToLHLOConversionPattern(module.getContext(), &patterns); - if (failed(applyFullConversion(module, target, patterns, nullptr))) { + // Do partial conversion so we can have unknown ops in tests. 
+ if (failed(applyPartialConversion(module, target, patterns, nullptr))) { signalPassFailure(); } } @@ -354,7 +421,7 @@ void populateHLOToLHLOConversionPattern(MLIRContext* context, OwningRewritePatternList* patterns) { // clang-format off patterns->insert< - HloToLHloReduceOpConverter, + HloToLHloDynamicBroadcastInDimOpConverter, HloToLhloFuncOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, @@ -379,9 +446,10 @@ void populateHLOToLHLOConversionPattern(MLIRContext* context, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, + HloToLHloReduceOpConverter, + StdToLhloReturnOpConverter, HloToLhloTensorLoadOpConverter, - HloToLhloTensorStoreOpConverter, - StdToLhloReturnOpConverter + HloToLhloTensorStoreOpConverter >(context); // clang-format on } From ce3da2622ccf4c7fca3a0346dc8d130723549454 Mon Sep 17 00:00:00 2001 From: Stephan Herhut Date: Thu, 20 Feb 2020 08:33:13 -0800 Subject: [PATCH 347/442] Support materializing dynamic broadcast operations for binary operations. This inserts a xla_hlo.dynamic_broadcast_in_dim operation for the two operands of the dynamic operation of the rank is known and a broadcast_dimensions attribute is present. PiperOrigin-RevId: 296216205 Change-Id: Ic5e5d80ce5921be91dd6c023af32a402859f24f4 --- tensorflow/compiler/mlir/xla/BUILD | 2 + tensorflow/compiler/mlir/xla/ir/hlo_ops.td | 16 ++ .../compiler/mlir/xla/mlir_hlo_to_hlo.cc | 6 + .../xla/tests/materialize-broadcasts.mlir | 36 ++++ .../xla/transforms/materialize_broadcasts.cc | 161 +++++++++++++++++- .../transforms/materialize_broadcasts_pass.cc | 3 + 6 files changed, 218 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index 0e912a30ab0..d3b7215d26d 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -316,6 +316,7 @@ cc_library( deps = [ ":hlo", "@llvm-project//mlir:IR", + "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Transforms", ], ) @@ -344,6 +345,7 @@ cc_library( ":xla_unfuse_batch_norm", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Transforms", ], alwayslink = 1, diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td index e9727798907..28c0a859f7d 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td @@ -777,6 +777,22 @@ def HLO_BroadcastInDimOp : HLO_Op<"broadcast_in_dim", let hasCustomHLOConverter = 1; } +def HLO_ScalarsToDimensionTensorOp : HLO_Op<"scalars_to_dimension_tensor", + [SameOperandsElementType, NoSideEffect]> { + string summary = "Converts a sequence of scalars into a 1d tensor."; + + string description = [{ + This is a useful operation that is currently missing in Standard. Used to + compute shape arguments to dynamic operations. + }]; + + let arguments = (ins Variadic); + let results = (outs HLO_DimensionTensor); + + // Cannot be exported to legacy formats. 
+ let hasCustomHLOConverter = 1; +} + def HLO_DynamicBroadcastInDimOp : HLO_Op<"dynamic_broadcast_in_dim", [NoSideEffect]> { string summary = "Broadcast a tensor into the given dynamic shape by adding dimensions."; diff --git a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc index c45baef855b..8fa7d809024 100644 --- a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc +++ b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc @@ -533,6 +533,12 @@ LogicalResult ExportXlaOp(BroadcastInDimOp op, OpLoweringContext ctx) { return success(); } +LogicalResult ExportXlaOp(ScalarsToDimensionTensorOp op, + OpLoweringContext ctx) { + // This op has no expression in the legacy export format. + return failure(); +} + LogicalResult ExportXlaOp(DynamicBroadcastInDimOp op, OpLoweringContext ctx) { // This op has no expression in the legacy export format. return failure(); diff --git a/tensorflow/compiler/mlir/xla/tests/materialize-broadcasts.mlir b/tensorflow/compiler/mlir/xla/tests/materialize-broadcasts.mlir index 53781158d58..682b153d474 100644 --- a/tensorflow/compiler/mlir/xla/tests/materialize-broadcasts.mlir +++ b/tensorflow/compiler/mlir/xla/tests/materialize-broadcasts.mlir @@ -235,3 +235,39 @@ func @compareBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tenso %0 = "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xi1> return %0 : tensor<1x4xi1> } + +// ----- + +// CHECK-LABEL: @dynamicBroadcastAdd +func @dynamicBroadcastAdd(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK-NEXT: %[[DIM0:.*]] = dim %arg0, 0 : tensor + // CHECK-NEXT: %[[DIM0C:.*]] = index_cast %[[DIM0]] : index to i32 + // CHECK-NEXT: %c1 = constant 1 : index + // CHECK-NEXT: %[[DIM1_0:.*]] = dim %arg0, 1 : tensor + // CHECK-NEXT: %[[DIM1_1:.*]] = dim %arg1, 0 : tensor + // CHECK-NEXT: %[[CMPI:.*]] = cmpi "eq", %[[DIM1_0]], %c1 : index + // CHECK-NEXT: %[[SEL:.*]] = select %[[CMPI]], %[[DIM1_0]], %[[DIM1_1]] : index + // CHECK-NEXT: %[[DIM1C:.*]] = index_cast %[[SEL]] : index to i32 + // CHECK-NEXT: %[[SHAPE:.*]] = "xla_hlo.scalars_to_dimension_tensor"(%[[DIM0C]], %[[DIM1C]]) : (i32, i32) -> tensor<2xi32> + // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[SHAPE]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xi32>) -> tensor + // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[SHAPE]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xi32>) -> tensor + // CHECK-NEXT: xla_hlo.add %[[BROADCAST0]], %[[BROADCAST1]] : tensor + %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor) -> tensor + return %0 : tensor +} + +// ----- + +// CHECK-LABEL: @dynamicBroadcastAddScalar +func @dynamicBroadcastAddScalar(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK-NEXT: %[[DIM0:.*]] = dim %arg0, 0 : tensor + // CHECK-NEXT: %[[DIM0C:.*]] = index_cast %[[DIM0]] : index to i32 + // CHECK-NEXT: %[[DIM1:.*]] = dim %arg0, 1 : tensor + // CHECK-NEXT: %[[DIM1C:.*]] = index_cast %[[DIM1]] : index to i32 + // CHECK-NEXT: %[[SHAPE:.*]] = "xla_hlo.scalars_to_dimension_tensor"(%[[DIM0C]], %[[DIM1C]]) : (i32, i32) -> tensor<2xi32> + // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[SHAPE]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xi32>) -> tensor + // CHECK-NEXT: 
%[[BROADCAST1:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[SHAPE]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<2xi32>) -> tensor + // CHECK-NEXT: xla_hlo.add %[[BROADCAST0]], %[[BROADCAST1]] : tensor + %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor + return %0 : tensor +} diff --git a/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc b/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc index 3ff6d374493..fbaab534565 100644 --- a/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc +++ b/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc @@ -15,6 +15,7 @@ limitations under the License. #include +#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project #include "mlir/IR/MLIRContext.h" // TF:llvm-project #include "mlir/IR/Operation.h" // TF:llvm-project #include "mlir/IR/PatternMatch.h" // TF:llvm-project @@ -72,10 +73,9 @@ bool CreateBroadcastsForBinaryOp(SrcOp op, PatternRewriter *rewriter, return false; } - if (!op_ranked_type.hasStaticShape()) { - // Dynamic result shape, can't use BroadcastInDimOp. - return false; - } + // Dynamic result shape, can't use BroadcastInDimOp. + assert(op_ranked_type.hasStaticShape() && + "dynamic shape requires DynamicBroadcastInDim"); auto lhs_rank = lhs_ranked_type.getRank(); auto rhs_rank = rhs_ranked_type.getRank(); @@ -118,6 +118,144 @@ bool CreateBroadcastsForBinaryOp(SrcOp op, PatternRewriter *rewriter, return true; } +// Helper template to generate code for computing the result shape of a +// broadcasted operation. This ultimately should be subsumed by functions +// from the shape dialect. +// Assumes that large and small are the operand values of `op` and that they +// have a ranked tensory type with rank(large) >= rank(small). +template +std::vector ComputeBroadcastedShape(SrcOp op, Value small, Value large, + PatternRewriter *rewriter) { + auto loc = op.getLoc(); + auto larger_ranked_type = large.getType().cast(); + auto output_rank = larger_ranked_type.getRank(); + + constexpr int kExpandShape = -1; + + std::vector shape_values; + shape_values.reserve(output_rank); + std::vector indexes(output_rank, kExpandShape); + DenseIntElementsAttr broadcast_dimensions = + op.broadcast_dimensions().getValue(); + // Compute a mapping from output dimensions to their corresponding input + // dimensions in the smaller ranked operand. + for (auto pair : llvm::enumerate(broadcast_dimensions.getIntValues())) { + indexes.at(pair.value().getLimitedValue()) = pair.index(); + } + + // Compute the broadcasted shape of the result using numpy style broadcasting + // semantics. The result shape at a position is the shape of the larger + // operand at that position if the no dimension of the smaller operand is + // mapped to it. + // If both operands contribute to an output dimension, their shape has to + // either be the same in that dimension or it can be 1, in which case the + // shape of the other operand is used. + for (int i = 0; i < output_rank; ++i) { + Value index_value; + if (indexes[i] == kExpandShape) { + // The smaller shape gets expanded to the larger one in this case. + index_value = rewriter->create(loc, large, i); + } else { + // Compute the result shape depending on whether the rank of smaller is 1. + // This does not check that the broadcast operation actualy is correct. + // In particular, we do not check that both shapes are the same if the + // smaller ranked shape is not 1. 
+ ConstantOp one = rewriter->create( + loc, rewriter->getIntegerAttr(rewriter->getIndexType(), 1)); + DimOp lrg_dim = rewriter->create(loc, large, i); + DimOp sml_dim = rewriter->create(loc, small, indexes[i]); + sml_dim.dump(); + CmpIOp compare = + rewriter->create(loc, CmpIPredicate::eq, lrg_dim, one); + index_value = + rewriter->create(loc, compare, lrg_dim, sml_dim); + } + // Ideally, we would like to keep this on index but MLIR does not allow + // this. + shape_values.push_back(rewriter->create( + loc, index_value, rewriter->getIntegerType(32))); + } + + return shape_values; +} + +// Helper function for OpRewritePattern classes to materialize dynamic +// broadcasts on LHS and RHS arguments to a binary op. +// +// Returns true and set out_lhs and out_rhs for materialized dynamic broadcasts +// for LHS and RHS arguments, else returns false. +template +bool CreateDynamicBroadcastsForBinaryOp(SrcOp op, PatternRewriter *rewriter, + Value *out_lhs, Value *out_rhs) { + if (!op.broadcast_dimensions().hasValue()) { + // Note: the op may still have an implicit broadcast on it, such as + // for (tensor<1xf32>, tensor<4xf32>). + return false; + } + + // Insert BroadcastInDimOps for the left-hand-side and right-hand-side args, + // replacing the original LHS and RHS args in the source op with the results + // of the broadcasts. + Value lhs = op.lhs(); + Value rhs = op.rhs(); + + auto lhs_ranked_type = lhs.getType().dyn_cast(); + auto rhs_ranked_type = rhs.getType().dyn_cast(); + if (!lhs_ranked_type || !rhs_ranked_type) { + // Unranked, can't determine at this point how to perform the broadcast. + return false; + } + + auto lhs_rank = lhs_ranked_type.getRank(); + auto rhs_rank = rhs_ranked_type.getRank(); + + // Set broadcast_dimensions to [0, ..., rank] for the higher rank arg. + // Use the original op.broadcast_dimensions for the lower rank arg. + auto higher_rank_broadcast_dims = + GetI64ElementsAttrForSeq(0, std::max(lhs_rank, rhs_rank), rewriter); + DenseIntElementsAttr lhs_broadcast_dims; + DenseIntElementsAttr rhs_broadcast_dims; + std::vector shape_elements; + if (lhs_rank > rhs_rank) { + lhs_broadcast_dims = higher_rank_broadcast_dims; + rhs_broadcast_dims = op.broadcast_dimensions().getValue(); + shape_elements = ComputeBroadcastedShape(op, rhs, lhs, rewriter); + } else if (lhs_rank < rhs_rank) { + lhs_broadcast_dims = op.broadcast_dimensions().getValue(); + rhs_broadcast_dims = higher_rank_broadcast_dims; + shape_elements = ComputeBroadcastedShape(op, lhs, rhs, rewriter); + } else { + // This shouldn't happen for legal ops. If the broadcast_dimensions + // attribute is set, the ranks should be different. + // TODO(scotttodd): Add a custom verification for ops and assert here. + return false; + } + + // DynamicBroadcastInDimOp preserves the element type but produces a tensor + // with unranked shape. The rank of the output is the length of the + // output shape argument. + SmallVector op_shape(shape_elements.size(), + RankedTensorType::kDynamicSize); + auto lhs_type = + RankedTensorType::get(op_shape, lhs_ranked_type.getElementType()); + auto rhs_type = + RankedTensorType::get(op_shape, rhs_ranked_type.getElementType()); + + // We need a way to turn a list of scalars into a vector. While Standard + // dialect does not have one, use the XLA_HLO variant. 
+ int shape_size = shape_elements.size(); + Type shape_element_type = shape_elements.front().getType(); + Value shape_value = rewriter->create( + op.getLoc(), RankedTensorType::get({shape_size}, shape_element_type), + shape_elements); + + *out_lhs = rewriter->createOrFold( + op.getLoc(), lhs_type, lhs, shape_value, lhs_broadcast_dims); + *out_rhs = rewriter->createOrFold( + op.getLoc(), rhs_type, rhs, shape_value, rhs_broadcast_dims); + return true; +} + template struct BinaryOpWithBroadcastConvert : public OpRewritePattern { explicit BinaryOpWithBroadcastConvert(MLIRContext *context) @@ -127,8 +265,19 @@ struct BinaryOpWithBroadcastConvert : public OpRewritePattern { PatternRewriter &rewriter) const override { Value new_lhs; Value new_rhs; - if (!CreateBroadcastsForBinaryOp(op, &rewriter, &new_lhs, &new_rhs)) { - return this->matchFailure(); + + auto op_ranked_type = op.getType().template dyn_cast(); + if (!op_ranked_type) return this->matchFailure(); + + if (op_ranked_type.hasStaticShape()) { + if (!CreateBroadcastsForBinaryOp(op, &rewriter, &new_lhs, &new_rhs)) { + return this->matchFailure(); + } + } else { + if (!CreateDynamicBroadcastsForBinaryOp(op, &rewriter, &new_lhs, + &new_rhs)) { + return this->matchFailure(); + } } // Replace the original op with a new one that uses the new args. diff --git a/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts_pass.cc b/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts_pass.cc index 933f8a73fd5..596b67f0eed 100644 --- a/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts_pass.cc +++ b/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts_pass.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "mlir/Dialect/StandardOps/Ops.h" // TF:llvm-project #include "mlir/IR/MLIRContext.h" // TF:llvm-project #include "mlir/IR/Operation.h" // TF:llvm-project #include "mlir/IR/PatternMatch.h" // TF:llvm-project @@ -34,6 +35,8 @@ struct TestMaterializeBroadcastsPass // Consider the xla_hlo dialect legal for tests. conversionTarget.addLegalDialect(); + // The conversion uses helpers from the Standard dialect. + conversionTarget.addLegalDialect(); SetupMaterializeBroadcastsLegality(&getContext(), &conversionTarget); PopulateMaterializeBroadcastsPatterns(&getContext(), &conversionPatterns); From d880414b2f9b9b0517695a654bc0f2dac21346dc Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 20 Feb 2020 08:55:01 -0800 Subject: [PATCH 348/442] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 296220963 Change-Id: I740d8c4b671dc71c987dcd9c81150f8838a64f33 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index ecdce1e627b..449a95765a5 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21345,7 +21345,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22053,7 +22053,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22249,7 +22249,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22318,7 +22318,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22433,7 +22433,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22492,7 +22492,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22666,7 +22666,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22857,7 +22857,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25297,7 +25297,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25629,7 +25629,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25679,7 +25679,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25929,7 +25929,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26559,7 +26559,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27624,7 +27624,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45536,7 +45536,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 6bf28952981415795ae36784c8c17b08218a4b5c Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Thu, 20 Feb 2020 09:15:32 -0800 Subject: [PATCH 349/442] Reduce overhead of protecting tensors for eager The eager executor tried to prevent forwarding of any input tensors by incrementing the reference count of any "non-consumed" inputs. This involved highly delicate logic which first signaled "non-consumed" inputs as those with a reference count greater than 1 (1 from python and another from the EagerOperation class), which require "protecting" by incrementing the underlying tensor buffer. This logic is highly heavyweight for the common case of synchronous execution. We thus simplify the logic by having all TensorHandle Tensors protected at construction and "unprotect" them if the reference count is 1. - Hold 2 reference counts on a TensorHandle's backing Tensor. This protects the Tensor from being forwarded. - Add the ability to unprotect a TensorHandle's backing Tensor when the reference count is 1. - Split ExecuteNode into Async implementation. The sync ExecuteNode class can avoid various copies such as the list of inputs and the forwarding map. - Remove the experimental TFE_OpConsumeInput API. Input forwarding can be achieved by releasing the handle after calling TFE_OpAddInput as demonstrated by the added tests. - Fix TF_AllocateTensor to return a forwardable tensor; it was previously disabled due to re-using the logic in TF_NewTensor. - Save mirror tensor when calling TFE_TensorHandleResolve. PiperOrigin-RevId: 296225251 Change-Id: I484cfccbef8b44e82757b8bda0981cd7fd2f8096 --- tensorflow/c/eager/c_api.cc | 18 ++- tensorflow/c/eager/c_api_experimental.cc | 4 - tensorflow/c/eager/c_api_experimental.h | 3 - tensorflow/c/eager/c_api_test.cc | 87 +++++++++++++ tensorflow/c/eager/c_api_test_util.cc | 15 +++ tensorflow/c/eager/c_api_test_util.h | 3 + tensorflow/c/eager/operation_interface.cc | 7 -- tensorflow/c/eager/operation_interface.h | 4 - tensorflow/c/tf_tensor.cc | 50 +++++--- tensorflow/c/tf_tensor_internal.h | 9 +- tensorflow/core/common_runtime/eager/BUILD | 2 + .../common_runtime/eager/eager_operation.h | 15 +-- .../core/common_runtime/eager/execute.cc | 15 ++- .../core/common_runtime/eager/execute.h | 4 +- .../core/common_runtime/eager/execute_node.cc | 28 ----- .../core/common_runtime/eager/execute_node.h | 118 +++++++++++++----- .../common_runtime/eager/tensor_handle.cc | 25 +++- .../core/common_runtime/eager/tensor_handle.h | 3 +- .../eager/tensor_handle_data.cc | 10 ++ .../common_runtime/eager/tensor_handle_data.h | 15 ++- .../eager/remote_tensor_handle_data.cc | 8 ++ .../eager/remote_tensor_handle_data.h | 2 + 22 files changed, 318 insertions(+), 127 deletions(-) diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index 6e2b24502c7..b6a87cc616d 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -1116,9 +1116,13 @@ TF_Tensor* tensorflow::TensorHandleInterface::Resolve(Status* status) { return retval; } else { tensorflow::Tensor tensor; - if (IsCPU(handle_->device())) { + if (IsCPU(handle_->device()) || handle_->HasLocalMirror(nullptr)) { const tensorflow::Tensor* src = nullptr; - *status = handle_->Tensor(&src); + if (handle_->HasLocalMirror(nullptr)) { + *status = handle_->TensorFromDevice(nullptr, &src); }
else { + *status = handle_->Tensor(&src); + } if (!status->ok()) return nullptr; tensor = *src; } else { @@ -1126,6 +1130,13 @@ TF_Tensor* tensorflow::TensorHandleInterface::Resolve(Status* status) { CHECK_NE(ctx, nullptr); *status = handle_->CopyToDevice(*ctx, ctx->HostCPU(), &tensor); if (!status->ok()) return nullptr; + if (handle_->ImplicitMirroring()) { + *status = handle_->AddEmptyLocalMirror(nullptr); + if (!status->ok()) return nullptr; + Tensor mirror = tensor; + *status = handle_->SetTensor(std::move(mirror), nullptr); + if (!status->ok()) return nullptr; + } } return tensorflow::TF_TensorFromTensor(tensor, status); } @@ -1193,7 +1204,8 @@ TFE_TensorHandle* TFE_NewTensorHandleFromDeviceMemory( // TODO(apassos) do we need to wrap the deallocator here to make sure to sync // the device? TF_ManagedBuffer* buf = - new TF_ManagedBuffer(data, len, deallocator, deallocator_arg); + new TF_ManagedBuffer(data, len, deallocator, deallocator_arg, + /*owns_memory=*/false); tensorflow::Tensor t(static_cast(dtype), tensorflow::TensorShape(dimvec), buf); diff --git a/tensorflow/c/eager/c_api_experimental.cc b/tensorflow/c/eager/c_api_experimental.cc index 4ed9194c554..afa36fe1210 100644 --- a/tensorflow/c/eager/c_api_experimental.cc +++ b/tensorflow/c/eager/c_api_experimental.cc @@ -39,10 +39,6 @@ void TFE_OpReset(TFE_Op* op_to_reset, const char* op_or_function_name, } } -void TFE_OpConsumeInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) { - status->status = op->operation->ConsumeInput(h); -} - void TFE_ContextEnableGraphCollection(TFE_Context* ctx) { ctx->context->SetShouldStoreGraphs(true); } diff --git a/tensorflow/c/eager/c_api_experimental.h b/tensorflow/c/eager/c_api_experimental.h index da27bc51360..92dab6a36c6 100644 --- a/tensorflow/c/eager/c_api_experimental.h +++ b/tensorflow/c/eager/c_api_experimental.h @@ -34,9 +34,6 @@ TF_CAPI_EXPORT extern void TFE_OpReset(TFE_Op* op_to_reset, const char* raw_device_name, TF_Status* status); -TF_CAPI_EXPORT extern void TFE_OpConsumeInput(TFE_Op* op, TFE_TensorHandle* h, - TF_Status* status); - // Enables only graph collection in RunMetadata on the functions executed from // this context. TF_CAPI_EXPORT extern void TFE_ContextEnableGraphCollection(TFE_Context* ctx); diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc index 2bffe783097..04060b13885 100644 --- a/tensorflow/c/eager/c_api_test.cc +++ b/tensorflow/c/eager/c_api_test.cc @@ -17,6 +17,8 @@ limitations under the License. #include +#include + #include "absl/strings/match.h" #include "tensorflow/c/eager/c_api_experimental.h" #include "tensorflow/c/eager/c_api_internal.h" @@ -583,6 +585,91 @@ TEST(CAPI, TensorHandleDevices) { TFE_DeleteContext(ctx); } +void ExecuteAdd(bool async, bool forward_input) { + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetAsync(opts, static_cast(async)); + TFE_Context* ctx = TFE_NewContext(opts, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_TensorHandle* n = TestMatrixTensorHandle100x100(); + // If a GPU exists, copy the handle to GPU so that we can exercise + // unprotecting a mirror. 
+ std::string gpu_device_name; + if (GetDeviceName(ctx, &gpu_device_name, "GPU")) { + TFE_TensorHandle* n_gpu = + TFE_TensorHandleCopyToDevice(n, ctx, gpu_device_name.c_str(), status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_TensorHandleEnableImplicitMirroring(n_gpu, status); + TFE_DeleteTensorHandle(n); + n = n_gpu; + } + + TFE_TensorHandle* m = TestMatrixTensorHandle100x100(); + + // Store pointer to raw buffer for validation of forwarding behaviour. + TF_Tensor* orig = TFE_TensorHandleResolve(n, status); + void* orig_ptr = TF_TensorData(orig); + TF_DeleteTensor(orig); + + TFE_Op* add_op = AddOp(ctx, n, m); + std::string cpu_device_name; + ASSERT_TRUE(GetDeviceName(ctx, &cpu_device_name, "CPU")); + TFE_OpSetDevice(add_op, cpu_device_name.c_str(), status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + if (forward_input) { + TFE_DeleteTensorHandle(n); + } + + int num_retvals = 1; + + if (async) { + // Enqueue dummy ops so we backlog async execution & actually test async. + for (int i = 0; i < 10000; ++i) { + TFE_TensorHandle* dummy = nullptr; + TFE_Execute(add_op, &dummy, &num_retvals, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteTensorHandle(dummy); + } + } + + TFE_TensorHandle* retval = nullptr; + TFE_Execute(add_op, &retval, &num_retvals, status); + EXPECT_EQ(1, num_retvals); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + if (!forward_input) { + TFE_DeleteTensorHandle(n); + } + TFE_DeleteOp(add_op); + + TF_Tensor* t = TFE_TensorHandleResolve(retval, status); + if (forward_input || async) { + EXPECT_EQ(orig_ptr, TF_TensorData(t)); + } else { + EXPECT_NE(orig_ptr, TF_TensorData(t)); + } + + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteTensorHandle(m); + TFE_DeleteTensorHandle(retval); + TFE_DeleteContext(ctx); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + float result[100 * 100] = {0}; + EXPECT_EQ(sizeof(result), TF_TensorByteSize(t)); + memcpy(&result[0], TF_TensorData(t), TF_TensorByteSize(t)); + TF_DeleteTensor(t); + for (int i = 0; i < 100 * 100; ++i) { + EXPECT_EQ(2.0f, result[i]); + } + TF_DeleteStatus(status); +} +TEST(CAPI, ExecuteAdd) { ExecuteAdd(false, false); } +TEST(CAPI, ExecuteAddAsync) { ExecuteAdd(true, false); } +TEST(CAPI, ExecuteAddForward) { ExecuteAdd(false, true); } +TEST(CAPI, ExecuteAddForwardAsync) { ExecuteAdd(true, true); } + void Execute_MatMul_CPU(bool async) { TF_Status* status = TF_NewStatus(); TFE_ContextOptions* opts = TFE_NewContextOptions(); diff --git a/tensorflow/c/eager/c_api_test_util.cc b/tensorflow/c/eager/c_api_test_util.cc index 51566b35a9f..bee76fe296f 100644 --- a/tensorflow/c/eager/c_api_test_util.cc +++ b/tensorflow/c/eager/c_api_test_util.cc @@ -131,6 +131,21 @@ TFE_TensorHandle* TestMatrixTensorHandle3X2() { return th; } +TFE_Op* AddOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b) { + TF_Status* status = TF_NewStatus(); + + TFE_Op* op = TFE_NewOp(ctx, "AddV2", status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(op, a, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(op, b, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteStatus(status); + TFE_OpSetAttrType(op, "T", TFE_TensorHandleDataType(a)); + + return op; +} + TFE_Op* MatMulOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b) { TF_Status* status = TF_NewStatus(); diff --git a/tensorflow/c/eager/c_api_test_util.h 
b/tensorflow/c/eager/c_api_test_util.h index 28062222cf0..2c2f8323363 100644 --- a/tensorflow/c/eager/c_api_test_util.h +++ b/tensorflow/c/eager/c_api_test_util.h @@ -42,6 +42,9 @@ TFE_TensorHandle* DoubleTestMatrixTensorHandle3X2(); // Return a tensor handle containing a 3x2 matrix of floats TFE_TensorHandle* TestMatrixTensorHandle3X2(); +// Return an add op multiplying `a` by `b`. +TFE_Op* AddOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b); + // Return a matmul op multiplying `a` by `b`. TFE_Op* MatMulOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b); diff --git a/tensorflow/c/eager/operation_interface.cc b/tensorflow/c/eager/operation_interface.cc index ce62590fd51..5703d3231bd 100644 --- a/tensorflow/c/eager/operation_interface.cc +++ b/tensorflow/c/eager/operation_interface.cc @@ -309,11 +309,4 @@ Status OperationInterface::SetUseXla(bool enable) { return Status::OK(); } -Status OperationInterface::ConsumeInput(TFE_TensorHandle* h) { - auto handle = - tensorflow::down_cast(h->handle.get())->Handle(); - operation_.ConsumeInput(handle); - return Status::OK(); -} - } // namespace tensorflow diff --git a/tensorflow/c/eager/operation_interface.h b/tensorflow/c/eager/operation_interface.h index 189d4b4e333..900c5112c08 100644 --- a/tensorflow/c/eager/operation_interface.h +++ b/tensorflow/c/eager/operation_interface.h @@ -99,9 +99,6 @@ class AbstractOperationInterface { virtual tensorflow::Status SetUseXla(bool enable) { return tensorflow::errors::Unimplemented("SetUseXla not implemented"); } - virtual tensorflow::Status ConsumeInput(TFE_TensorHandle* h) { - return tensorflow::errors::Unimplemented("ConsumeInput not implemented"); - } virtual tensorflow::Status SetCancellationManager( TFE_CancellationManager* cancellation_manager) { return tensorflow::errors::Unimplemented( @@ -172,7 +169,6 @@ class OperationInterface : public AbstractOperationInterface { Status OutputLength(const char* output_name, int* length) override; Status SetUseXla(bool enable) override; - Status ConsumeInput(TFE_TensorHandle* h) override; Status SetCancellationManager( TFE_CancellationManager* cancellation_manager) override; diff --git a/tensorflow/c/tf_tensor.cc b/tensorflow/c/tf_tensor.cc index 6bb2cafbbc5..4e75beceb3e 100644 --- a/tensorflow/c/tf_tensor.cc +++ b/tensorflow/c/tf_tensor.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/c/tf_tensor.h" #include +#include #include "tensorflow/c/tf_status.h" #include "tensorflow/c/tf_status_helper.h" @@ -64,25 +65,41 @@ void deallocate_buffer(void* data, size_t len, void* arg) { } } // namespace tensorflow +namespace { +TF_Tensor* CreateTensor(TF_ManagedBuffer* buf, TF_DataType dtype, + const int64_t* dims, int num_dims, size_t len) { + std::vector dimvec(num_dims); + for (int i = 0; i < num_dims; ++i) { + dimvec[i] = static_cast(dims[i]); + } + + // TODO(gjn): Make the choice of interface a compile-time configuration. 
+ tensorflow::TensorInterface ret( + Tensor(static_cast(dtype), + tensorflow::TensorShape(dimvec), buf)); + buf->Unref(); + size_t elem_size = TF_DataTypeSize(dtype); + if (elem_size > 0 && len < (elem_size * ret.NumElements())) { + return nullptr; + } + return new TF_Tensor{std::make_unique(ret)}; +} +} // namespace TF_Tensor* TF_AllocateTensor(TF_DataType dtype, const int64_t* dims, int num_dims, size_t len) { void* data = tensorflow::allocate_tensor("TF_AllocateTensor", len, tensorflow::cpu_allocator()); - return TF_NewTensor(dtype, dims, num_dims, data, len, - tensorflow::deallocate_buffer, - tensorflow::cpu_allocator()); + TF_ManagedBuffer* buf = + new TF_ManagedBuffer(data, len, tensorflow::deallocate_buffer, + tensorflow::cpu_allocator(), /*owns_memory=*/true); + return CreateTensor(buf, dtype, dims, num_dims, len); } TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims, void* data, size_t len, void (*deallocator)(void* data, size_t len, void* arg), void* deallocator_arg) { - std::vector dimvec(num_dims); - for (int i = 0; i < num_dims; ++i) { - dimvec[i] = static_cast(dims[i]); - } - TF_ManagedBuffer* buf = nullptr; if (dtype != TF_STRING && dtype != TF_RESOURCE && tensorflow::DataTypeCanUseMemcpy( @@ -97,24 +114,17 @@ TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims, // Other types have the same representation, so copy only if it is safe to // do so. buf = new TF_ManagedBuffer(tensorflow::allocate_tensor("TF_NewTensor", len), - len, tensorflow::deallocate_buffer, nullptr); + len, tensorflow::deallocate_buffer, nullptr, + /*owns_memory=*/true); std::memcpy(buf->data(), data, len); // Free the original buffer. deallocator(data, len, deallocator_arg); } else { - buf = new TF_ManagedBuffer(data, len, deallocator, deallocator_arg); + buf = new TF_ManagedBuffer(data, len, deallocator, deallocator_arg, + /*owns_memory=*/false); } - // TODO(gjn): Make the choice of interface a compile-time configuration. - tensorflow::TensorInterface ret( - Tensor(static_cast(dtype), - tensorflow::TensorShape(dimvec), buf)); - buf->Unref(); - size_t elem_size = TF_DataTypeSize(dtype); - if (elem_size > 0 && len < (elem_size * ret.NumElements())) { - return nullptr; - } - return new TF_Tensor{std::make_unique(ret)}; + return CreateTensor(buf, dtype, dims, num_dims, len); } TF_Tensor* TF_TensorMaybeMove(TF_Tensor* t) { diff --git a/tensorflow/c/tf_tensor_internal.h b/tensorflow/c/tf_tensor_internal.h index 7ce6e637b2b..08a55f26a83 100644 --- a/tensorflow/c/tf_tensor_internal.h +++ b/tensorflow/c/tf_tensor_internal.h @@ -38,11 +38,12 @@ class TF_ManagedBuffer : public tensorflow::TensorBuffer { public: TF_ManagedBuffer(void* data, size_t len, void (*deallocator)(void* data, size_t len, void* arg), - void* deallocator_arg) + void* deallocator_arg, bool owns_memory) : TensorBuffer(data), len_(len), deallocator_(deallocator), - deallocator_arg_(deallocator_arg) {} + deallocator_arg_(deallocator_arg), + owns_memory_(owns_memory) {} ~TF_ManagedBuffer() override { (*deallocator_)(data(), len_, deallocator_arg_); @@ -57,13 +58,13 @@ class TF_ManagedBuffer : public tensorflow::TensorBuffer { proto->set_allocator_name(tensorflow::cpu_allocator()->Name()); } - // Prevents input forwarding from mutating this buffer. 
- bool OwnsMemory() const override { return false; } + bool OwnsMemory() const override { return owns_memory_; } private: const size_t len_; void (*const deallocator_)(void* data, size_t len, void* arg); void* const deallocator_arg_; + bool owns_memory_; }; namespace tensorflow { diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD index c5bde68da02..76e34173459 100644 --- a/tensorflow/core/common_runtime/eager/BUILD +++ b/tensorflow/core/common_runtime/eager/BUILD @@ -110,6 +110,7 @@ tf_cuda_library( "//tensorflow/core:framework", "//tensorflow/core/platform:errors", "//tensorflow/core/platform:platform_port", + "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:variant", ], @@ -338,6 +339,7 @@ cc_library( ":kernel_and_device", ":tensor_handle", ":process_function_library_runtime", + "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/types:optional", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", diff --git a/tensorflow/core/common_runtime/eager/eager_operation.h b/tensorflow/core/common_runtime/eager/eager_operation.h index 0261818ac96..524edf4b21f 100644 --- a/tensorflow/core/common_runtime/eager/eager_operation.h +++ b/tensorflow/core/common_runtime/eager/eager_operation.h @@ -15,6 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_OPERATION_H_ #define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_OPERATION_H_ +#include "absl/container/inlined_vector.h" #include "absl/types/optional.h" #include "absl/types/variant.h" #include "tensorflow/core/common_runtime/eager/attr_builder.h" @@ -61,12 +62,13 @@ class EagerOperation { const AttrBuilder& Attrs() const { return attrs_; } const tensorflow::OpDef* OpDef() const { return op_def_; } - const gtl::InlinedVector& Inputs() const { return inputs_; } - gtl::InlinedVector* MutableInputs() { return &inputs_; } + const absl::InlinedVector& Inputs() const { + return inputs_; + } + absl::InlinedVector* MutableInputs() { return &inputs_; } void AddInput(TensorHandle* h); void UpdateInput(int i, TensorHandle* h); - void ConsumeInput(TensorHandle* h); const string& Name() const { return attrs_.op_name(); } const AttrTypeMap* AttrTypes() const { return attr_types_; } @@ -140,7 +142,7 @@ class EagerOperation { tensorflow::EagerContext& ctx_; AttrBuilder attrs_; const AttrTypeMap* attr_types_; - gtl::InlinedVector inputs_; + absl::InlinedVector inputs_; absl::variant device_; string raw_device_name_; string device_name_; @@ -173,11 +175,6 @@ inline void EagerOperation::UpdateInput(int i, TensorHandle* h) { *slot = h; // Update inputs_[i] to h } } - -inline void EagerOperation::ConsumeInput(TensorHandle* h) { - inputs_.push_back(h); - attrs_.NumInputs(static_cast(inputs_.size())); -} } // namespace tensorflow #endif // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EAGER_OPERATION_H_ diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc index 0d57a1dfe0e..bc1bf9c1610 100644 --- a/tensorflow/core/common_runtime/eager/execute.cc +++ b/tensorflow/core/common_runtime/eager/execute.cc @@ -29,6 +29,7 @@ limitations under the License. #include "tensorflow/core/platform/platform.h" // clang-format on +#include "absl/container/inlined_vector.h" #include "absl/strings/match.h" #include "absl/strings/str_cat.h" #include "absl/types/optional.h" @@ -64,7 +65,6 @@ limitations under the License. 
#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/gtl/flatset.h" -#include "tensorflow/core/lib/gtl/inlined_vector.h" #include "tensorflow/core/lib/random/random.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/mutex.h" @@ -583,20 +583,19 @@ Status EagerLocalExecute(EagerOperation* op, TensorHandle** retvals, Status s; if (async) { - auto node = absl::make_unique( + auto node = absl::make_unique( &ctx, op->Inputs(), op->remote_func_params(), std::move(kernel), graph_collector, output_dtypes, op->GetCancellationManager(), - executor.Async(), absl::Span(retvals, num_outputs)); + absl::Span(retvals, num_outputs)); // For async mode, execution order will make sure that all // input handles are ready before executing them. // TODO(b/137118203): Consider executing "cheap" kernels inline for // performance. s = executor.AddOrExecute(std::move(node)); } else { - ExecuteNode node(&ctx, op->Inputs(), op->remote_func_params(), - std::move(kernel), graph_collector, output_dtypes, - op->GetCancellationManager(), executor.Async(), - {retvals, num_outputs}); + ExecuteNode node(&ctx, op->Inputs(), op->remote_func_params(), kernel, + graph_collector, output_dtypes, + op->GetCancellationManager(), {retvals, num_outputs}); s = executor.SyncExecute(&node); } // Since the operation failed, we need to Unref any outputs that were @@ -978,7 +977,7 @@ Status EagerExecute(EagerOperation* op, TensorHandle** retvals, // TODO(gjn): Consider moving into ExecuteNode class Status EagerKernelExecute( - EagerContext* ctx, const gtl::InlinedVector& op_inputs, + EagerContext* ctx, const absl::InlinedVector& op_inputs, const absl::optional& remote_func_params, const core::RefCountPtr& kernel, GraphCollector* graph_collector, CancellationManager* cancellation_manager, diff --git a/tensorflow/core/common_runtime/eager/execute.h b/tensorflow/core/common_runtime/eager/execute.h index cc29bb9d898..8ed8b9555e3 100644 --- a/tensorflow/core/common_runtime/eager/execute.h +++ b/tensorflow/core/common_runtime/eager/execute.h @@ -15,6 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EXECUTE_H_ #define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EXECUTE_H_ +#include "absl/container/inlined_vector.h" #include "absl/types/span.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/eager/context.h" @@ -23,7 +24,6 @@ limitations under the License. #include "tensorflow/core/common_runtime/eager/tensor_handle.h" #include "tensorflow/core/framework/step_stats.pb.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/inlined_vector.h" namespace tensorflow { @@ -48,7 +48,7 @@ Status EagerExecute(EagerOperation* op, TensorHandle** retvals, // Low-level utility to execute the kernel specified by `kernel` on // `kernel->device()`, with the inputs op_inputs, in the context 'ctx'. 
Status EagerKernelExecute( - EagerContext* ctx, const gtl::InlinedVector& op_inputs, + EagerContext* ctx, const absl::InlinedVector& op_inputs, const absl::optional& remote_func_params, const core::RefCountPtr& kernel, GraphCollector* graph_collector, CancellationManager* cancellation_manager, diff --git a/tensorflow/core/common_runtime/eager/execute_node.cc b/tensorflow/core/common_runtime/eager/execute_node.cc index 8b1d03a0935..c053420fe83 100644 --- a/tensorflow/core/common_runtime/eager/execute_node.cc +++ b/tensorflow/core/common_runtime/eager/execute_node.cc @@ -26,8 +26,6 @@ Status ExecuteNodeArgs::Init( // below when we insert a copy of the Tensor into protected_tensors, and will // be decremented once execution is complete. const int n_inputs = op_inputs.size(); - int num_protected_tensors = 0; - int first_index_that_needs_protecting = -1; // Used to avoid second loop if (n_inputs > 0) { TensorHandle* const* op_inputs_array = &op_inputs[0]; TensorValue* tensor_args_array = &tensor_args_[0]; @@ -37,33 +35,12 @@ Status ExecuteNodeArgs::Init( TF_RETURN_IF_ERROR( in->TensorValue(&tensor_args_array[i], ctx->CanonicalDevice(kernel->InputDevice(i)))); - if (!in->RefCountIsOne()) { - if (first_index_that_needs_protecting < 0) { - first_index_that_needs_protecting = i; - } - ++num_protected_tensors; - } } else { if (!has_remote_inputs_) { has_remote_inputs_ = true; } } } - - protected_tensors_.reserve(num_protected_tensors); - if (first_index_that_needs_protecting >= 0) { - for (int i = first_index_that_needs_protecting; - num_protected_tensors && (i < n_inputs); ++i) { - TensorHandle* in = op_inputs_array[i]; - if (!in->IsRemote() && !in->RefCountIsOne()) { - const Tensor* input_tensor = nullptr; - TF_RETURN_IF_ERROR(op_inputs_array[i]->TensorFromDevice( - ctx->CanonicalDevice(kernel->InputDevice(i)), &input_tensor)); - protected_tensors_.emplace_back(TensorReference(*input_tensor)); - --num_protected_tensors; - } - } - } } if (has_remote_inputs_) { @@ -91,9 +68,4 @@ Status ExecuteNodeArgs::Init( return Status::OK(); } -ExecuteNodeArgs::~ExecuteNodeArgs() { - for (const auto& tensor_ref : protected_tensors_) { - tensor_ref.Unref(); - } -} } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/eager/execute_node.h b/tensorflow/core/common_runtime/eager/execute_node.h index 2dee244bc61..7e5340575c9 100644 --- a/tensorflow/core/common_runtime/eager/execute_node.h +++ b/tensorflow/core/common_runtime/eager/execute_node.h @@ -19,10 +19,14 @@ limitations under the License. // Required for IS_MOBILE_PLATFORM #include #include +#include +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/platform.h" // clang-format on +#include "absl/container/inlined_vector.h" #include "absl/memory/memory.h" +#include "absl/types/optional.h" #include "absl/types/span.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/eager/context.h" @@ -34,7 +38,6 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/gtl/inlined_vector.h" #include "tensorflow/core/lib/strings/strcat.h" #if !defined(IS_MOBILE_PLATFORM) #include "tensorflow/core/distributed_runtime/eager/remote_mgr.h" @@ -46,10 +49,9 @@ namespace tensorflow { class ExecuteNodeArgs : public EagerKernelArgs { public: explicit ExecuteNodeArgs(int count) : EagerKernelArgs(count) {} - ~ExecuteNodeArgs() override; Status Init(EagerContext* ctx, - const gtl::InlinedVector& op_inputs, + const absl::InlinedVector& op_inputs, const core::RefCountPtr& kernel); bool HasRemoteInputs() const override { return has_remote_inputs_; }; @@ -63,7 +65,6 @@ class ExecuteNodeArgs : public EagerKernelArgs { private: bool has_remote_inputs_ = false; - TensorReferenceVector protected_tensors_; #if !defined(IS_MOBILE_PLATFORM) std::function serialize_remote_handle_; @@ -73,11 +74,64 @@ class ExecuteNodeArgs : public EagerKernelArgs { class ExecuteNode : public EagerNode { public: ExecuteNode( - EagerContext* ctx, const gtl::InlinedVector& inputs, + EagerContext* ctx, const absl::InlinedVector& inputs, + const absl::optional& remote_func_params, + const core::RefCountPtr& kernel, + GraphCollector* graph_collector, const DataTypeVector& output_dtypes, + CancellationManager* cancellation_manager, + absl::Span retvals) + : EagerNode(), + ctx_(ctx), + inputs_(inputs), + remote_func_params_(remote_func_params), + kernel_(kernel), + graph_collector_(graph_collector), + cancellation_manager_(cancellation_manager), + retvals_(retvals) {} + + Status Run() override { + int i = 0; + for (TensorHandle* h : inputs_) { + if (h->RefCountIsOne()) { + const Device* d = ctx_->CanonicalDevice(kernel_->InputDevice(i)); + Status s = h->Unprotect(d); + if (!s.ok()) { + VLOG(1) << "Unable to unprotect tensor: " << s; + } + } + ++i; + } + return EagerKernelExecute(ctx_, inputs_, remote_func_params_, kernel_, + graph_collector_, cancellation_manager_, + retvals_); + } + + void Abort(Status status) override {} + + std::string DebugString() const override { + std::string out = "[ExecuteNode]"; + strings::StrAppend(&out, " kernel: ", kernel_->name()); + return out; + } + + private: + EagerContext* ctx_; + const absl::InlinedVector& inputs_; + const absl::optional& remote_func_params_; + const core::RefCountPtr& kernel_; + GraphCollector* graph_collector_; + CancellationManager* const cancellation_manager_; + absl::Span retvals_; +}; + +class AsyncExecuteNode : public EagerNode { + public: + AsyncExecuteNode( + EagerContext* ctx, const absl::InlinedVector& inputs, const absl::optional& remote_func_params, core::RefCountPtr kernel, GraphCollector* graph_collector, const DataTypeVector& output_dtypes, - CancellationManager* cancellation_manager, bool async, + CancellationManager* cancellation_manager, absl::Span retvals) : EagerNode(), ctx_(ctx), @@ -85,40 +139,43 @@ class ExecuteNode : public EagerNode { remote_func_params_(remote_func_params), kernel_(std::move(kernel)), graph_collector_(graph_collector), - cancellation_manager_(cancellation_manager), - async_(async) { + cancellation_manager_(cancellation_manager) { // Copy the output handles, since the container for them might get // destroyed. for (auto handle : retvals) { + handle->Ref(); retvals_.push_back(handle); } - if (async_) { - // This is required to ensure that the tensor handles stay alive across - // the execution. 
- for (auto handle : inputs_) { - handle->Ref(); - } - - for (auto handle : retvals_) { - handle->Ref(); - } + // This is required to ensure that the tensor handles stay alive across + // the execution. + for (auto handle : inputs_) { + handle->Ref(); } } - ~ExecuteNode() override { - if (async_) { - for (auto handle : retvals_) { - handle->Unref(); - } + ~AsyncExecuteNode() override { + for (auto handle : retvals_) { + handle->Unref(); + } - for (auto handle : inputs_) { - handle->Unref(); - } + for (auto handle : inputs_) { + handle->Unref(); } } Status Run() override { + int i = 0; + for (TensorHandle* h : inputs_) { + if (h->RefCountIsOne()) { + const Device* d = ctx_->CanonicalDevice(kernel_->InputDevice(i)); + Status s = h->Unprotect(d); + if (!s.ok()) { + VLOG(1) << "Unable to unprotect tensor: " << s; + } + } + ++i; + } const Status status = EagerKernelExecute( ctx_, inputs_, remote_func_params_, kernel_, graph_collector_, cancellation_manager_, absl::MakeSpan(retvals_)); @@ -137,21 +194,20 @@ class ExecuteNode : public EagerNode { } } - string DebugString() const override { - string out = "[ExecuteNode]"; + std::string DebugString() const override { + std::string out = "[AsyncExecuteNode]"; strings::StrAppend(&out, " kernel: ", kernel_->name()); return out; } private: EagerContext* ctx_; - gtl::InlinedVector inputs_; + absl::InlinedVector inputs_; const absl::optional remote_func_params_; core::RefCountPtr kernel_; GraphCollector* graph_collector_; CancellationManager* const cancellation_manager_; - const bool async_; - gtl::InlinedVector retvals_; + absl::InlinedVector retvals_; }; } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc index 9e49cd1fb87..ef2b3104ed8 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.cc +++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc @@ -23,6 +23,7 @@ limitations under the License. 
#include #include "absl/strings/substitute.h" +#include "absl/types/variant.h" #include "tensorflow/core/common_runtime/copy_tensor.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_factory.h" @@ -222,7 +223,7 @@ TensorHandle::TensorHandle(std::unique_ptr t, implicit_mirroring_(true), is_ready_(!async), tensor_handle_data_(std::move(t)) { - DVLOG(3) << "Creating Async Local TensorHandle: " << this + DVLOG(3) << "Creating empty Local TensorHandle: " << this << " device: " << VariantDeviceDebugString(device_); } @@ -494,6 +495,26 @@ Status TensorHandle::NumElements(int64* num_elements) const { } } +Status TensorHandle::Unprotect(const Device* d) { + if (d == absl::get(device_)) { + return tensor_handle_data_->Unprotect(); + } + + tf_shared_lock l(mu_); + auto mirror = local_mirrors_.find(d); + if (mirror != local_mirrors_.end()) { + return mirror->second->Unprotect(); + } + + auto empty_mirror = empty_local_mirrors_.find(d); + if (empty_mirror != empty_local_mirrors_.end()) { + return errors::Internal("Attempted to unprotect an empty mirror"); + } + + return errors::Internal("Invalid device: ", d, + " in Unprotect call to handle: ", this); +} + bool TensorHandle::HasLocalMirror(Device* d) { mutex_lock l(mu_); auto mirror = local_mirrors_.find(d); @@ -653,7 +674,7 @@ Status TensorHandle::SetRemoteShape(const TensorShape& shape, return Status::OK(); } - DCHECK(is_remote_) << "SeRemoteShape is only called on remote handles."; + DCHECK(is_remote_) << "SetRemoteShape is only called on remote handles."; DCHECK(!IsReady()) << "SetRemoteShape is only called on non-ready handles."; UnshapedRemoteTensorHandleData* p = diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.h b/tensorflow/core/common_runtime/eager/tensor_handle.h index 2024111ef35..bae03a96f33 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.h +++ b/tensorflow/core/common_runtime/eager/tensor_handle.h @@ -46,7 +46,6 @@ limitations under the License. #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/gtl/inlined_vector.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/platform/fingerprint.h" @@ -147,6 +146,8 @@ class TensorHandle : public core::RefCounted { Status Dim(int dim_index, int64* dim) const; Status NumElements(int64* num_elements) const; + Status Unprotect(const Device* d); + // Checks if a mirror tensor exists for the specified device. Mirrors are only // maintained for local devices, like CPUs & GPUs. Note a mirror may be empty, // as it is still to be set by an async operation. diff --git a/tensorflow/core/common_runtime/eager/tensor_handle_data.cc b/tensorflow/core/common_runtime/eager/tensor_handle_data.cc index d718e39687f..b6d17e1ee1a 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle_data.cc +++ b/tensorflow/core/common_runtime/eager/tensor_handle_data.cc @@ -58,6 +58,12 @@ Status LocalTensorHandleData::NumElements(int64* num_elements) const { return Status::OK(); } +Status LocalTensorHandleData::Unprotect() { + forwarding_protection_tensor_ = tensorflow::Tensor(); + + return Status::OK(); +} + Status EmptyLocalTensorHandleData::Tensor(const tensorflow::Tensor** t) const { return errors::Unavailable( "Unable to get a tensor for an empty handle. 
" @@ -94,6 +100,10 @@ Status EmptyLocalTensorHandleData::NumElements(int64* num_elements) const { "Please wait until it is ready"); } +Status EmptyLocalTensorHandleData::Unprotect() { + return errors::Unavailable("Unable to unprotect an empty handle."); +} + string EmptyLocalTensorHandleData::DebugString() const { return "EmptyLocalTensorHandleData"; } diff --git a/tensorflow/core/common_runtime/eager/tensor_handle_data.h b/tensorflow/core/common_runtime/eager/tensor_handle_data.h index e50200277f1..5e600cc8818 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle_data.h +++ b/tensorflow/core/common_runtime/eager/tensor_handle_data.h @@ -34,6 +34,9 @@ class TensorHandleData { virtual Status NumDims(int* num_dims) const = 0; virtual Status Dim(int dim_index, int64* dim) const = 0; virtual Status NumElements(int64* num_elements) const = 0; + // Allow the backing Tensor to be available for buffer reuse during op + // execution. + virtual Status Unprotect() = 0; virtual string DebugString() const = 0; }; @@ -41,7 +44,8 @@ class TensorHandleData { // Local Tensor Handle: Handle to a Tensor present on the local host. class LocalTensorHandleData : public TensorHandleData { public: - explicit LocalTensorHandleData(const tensorflow::Tensor& t) : tensor_(t) {} + explicit LocalTensorHandleData(const tensorflow::Tensor& t) + : tensor_(t), forwarding_protection_tensor_(t) {} ~LocalTensorHandleData() override {} // A local tensor handle should be able to satisfy all of these requests. @@ -51,11 +55,19 @@ class LocalTensorHandleData : public TensorHandleData { Status NumDims(int* num_dims) const override; Status Dim(int dim_index, int64* dim) const override; Status NumElements(int64* num_elements) const override; + Status Unprotect() override; string DebugString() const override { return tensor_.DebugString(); } private: tensorflow::Tensor tensor_; + // TensorHandle has its own reference counting which is distinct from the + // backing Tensor. As a result, if the Tensor reference count is 1 while + // executing an op, the TensorBuffer could be reused for the output. We avoid + // this behavior maintaining another reference count with the + // forwarding_protection_tensor_ Tensor. When Unprotect() is called, we + // release this Tensor to allow forwarding. 
+ tensorflow::Tensor forwarding_protection_tensor_; }; // Empty Local Tensor Handle: Once the execution is complete this is replaced by @@ -73,6 +85,7 @@ class EmptyLocalTensorHandleData : public TensorHandleData { Status NumDims(int* num_dims) const override; Status Dim(int dim_index, int64* dim) const override; Status NumElements(int64* num_elements) const override; + Status Unprotect() override; string DebugString() const override; }; diff --git a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc index af63c20a7f4..e083aedcc47 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc +++ b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc @@ -142,6 +142,10 @@ Status RemoteTensorHandleData::NumElements(int64* num_elements) const { return Status::OK(); } +Status RemoteTensorHandleData::Unprotect() { + return errors::Unavailable("Unable to unprotect a remote handle."); +} + string RemoteTensorHandleData::DebugString() const { return strings::StrCat("RemoteTensorHandleData:", " op_id: ", op_id_, " output_num: ", output_num_); @@ -207,6 +211,10 @@ Status UnshapedRemoteTensorHandleData::NumElements(int64* num_elements) const { "until it is ready"); } +Status UnshapedRemoteTensorHandleData::Unprotect() { + return errors::Unavailable("Unable to unprotect a remote handle."); +} + string UnshapedRemoteTensorHandleData::DebugString() const { return strings::StrCat("UnshapedRemoteTensorHandleDat:", " op_id: ", op_id_, " output_num: ", output_num_); diff --git a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h index effcefe742e..56c51beffb0 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h +++ b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h @@ -37,6 +37,7 @@ class RemoteTensorHandleData : public TensorHandleData { Status NumDims(int* num_dims) const override; Status Dim(int dim_index, int64* dim) const override; Status NumElements(int64* num_elements) const override; + Status Unprotect() override; string DebugString() const override; @@ -70,6 +71,7 @@ class UnshapedRemoteTensorHandleData : public TensorHandleData { Status NumDims(int* num_dims) const override; Status Dim(int dim_index, int64* dim) const override; Status NumElements(int64* num_elements) const override; + Status Unprotect() override; string DebugString() const override; From d3beb51ab8f80d34260b0616f33b5afbcf3d8a6b Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Thu, 20 Feb 2020 09:48:10 -0800 Subject: [PATCH 350/442] Revert Dense layer changes that attempt to support 1d inputs. PiperOrigin-RevId: 296232311 Change-Id: Iffc4ea2267b8846dec15397328ef9bf1ddc21760 --- tensorflow/python/keras/layers/core.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py index 65aadd7cd08..32ad7a89b77 100644 --- a/tensorflow/python/keras/layers/core.py +++ b/tensorflow/python/keras/layers/core.py @@ -1122,17 +1122,11 @@ class Dense(Layer): raise TypeError('Unable to build `Dense` layer with non-floating point ' 'dtype %s' % (dtype,)) input_shape = tensor_shape.TensorShape(input_shape) - # Handle 1-d inputs by reshaping to (-1, 1). 
- if input_shape.rank == 1: - input_shape = tensor_shape.TensorShape(input_shape.as_list() + [1]) - last_dim = tensor_shape.dimension_value(1) - self.input_spec = InputSpec(min_ndim=1, max_ndim=2) - else: - if tensor_shape.dimension_value(input_shape[-1]) is None: - raise ValueError('The last dimension of the inputs to `Dense` ' - 'should be defined. Found `None`.') - last_dim = tensor_shape.dimension_value(input_shape[-1]) - self.input_spec = InputSpec(min_ndim=2, axes={-1: last_dim}) + if tensor_shape.dimension_value(input_shape[-1]) is None: + raise ValueError('The last dimension of the inputs to `Dense` ' + 'should be defined. Found `None`.') + last_dim = tensor_shape.dimension_value(input_shape[-1]) + self.input_spec = InputSpec(min_ndim=2, axes={-1: last_dim}) self.kernel = self.add_weight( 'kernel', shape=[last_dim, self.units], @@ -1165,8 +1159,6 @@ class Dense(Layer): output_shape = shape[:-1] + [self.units] outputs.set_shape(output_shape) else: - if rank == 1: - inputs = array_ops.expand_dims_v2(inputs, axis=-1) inputs = math_ops.cast(inputs, self._compute_dtype) if K.is_sparse(inputs): outputs = sparse_ops.sparse_tensor_dense_matmul(inputs, self.kernel) From 33b8e22a6ef9f2340dc40064c60d4ad91558126f Mon Sep 17 00:00:00 2001 From: Yunlu Li Date: Thu, 20 Feb 2020 09:50:53 -0800 Subject: [PATCH 351/442] Internal change only. PiperOrigin-RevId: 296232961 Change-Id: I083e446349c1c68b4a21fe58d59a6babe93e69f2 --- tensorflow/lite/python/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/lite/python/BUILD b/tensorflow/lite/python/BUILD index 61e36aac4b7..a1f9baf7c7d 100644 --- a/tensorflow/lite/python/BUILD +++ b/tensorflow/lite/python/BUILD @@ -39,6 +39,7 @@ py_test( "no_windows", "noasan", # TODO(b/137568139): enable after this is fixed. "nomsan", # TODO(b/137568139): enable after this is fixed. + "notsan", # TODO(b/149882556): enable after this is fixed. ], deps = [ ":interpreter", From e85f354bba3e20224a1bd3df91b47161c8218592 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 20 Feb 2020 09:55:51 -0800 Subject: [PATCH 352/442] Remove spurious std:cerr debugging statement. PiperOrigin-RevId: 296234228 Change-Id: I4817dbcaf48ab37fe68df18bad5f030746099341 --- tensorflow/core/common_runtime/dynamic_device_mgr.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/core/common_runtime/dynamic_device_mgr.cc b/tensorflow/core/common_runtime/dynamic_device_mgr.cc index a38c74dd4b3..f7e2e27e4ab 100644 --- a/tensorflow/core/common_runtime/dynamic_device_mgr.cc +++ b/tensorflow/core/common_runtime/dynamic_device_mgr.cc @@ -194,7 +194,6 @@ Device* DynamicDeviceMgr::HostCPU() const { } cpu_device_ = nullptr; for (const auto& pair : dynamic_devices_) { - std::cerr << "WOWZA: " << pair.first << std::endl; if (pair.first->device_type() == DEVICE_CPU) { cpu_device_ = pair.first; break; From 51c182c4a35e55b969f9934f9ba85d840cfb4b92 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Thu, 20 Feb 2020 10:01:00 -0800 Subject: [PATCH 353/442] Add a `classifier_activation` option to keras.applications Defaults to `softmax` (the current behavior) but now users have the option of turning it off, or setting a different activation function. 
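As a rough usage sketch (illustrative only; the model choice, input size, and availability of the pretrained weights are assumptions, not part of this patch), a user who wants raw logits instead of probabilities could now write:

```python
import tensorflow as tf

# Keep the pretrained ImageNet top layer but drop the final softmax,
# so the model outputs raw logits instead of probabilities.
model = tf.keras.applications.ResNet50V2(
    weights="imagenet",
    include_top=True,
    classifier_activation=None,  # default is "softmax"
)

images = tf.random.uniform((1, 224, 224, 3)) * 255.0  # dummy batch in [0, 255]
logits = model(tf.keras.applications.resnet_v2.preprocess_input(images))
probs = tf.nn.softmax(logits)  # softmax can still be applied manually
```

With pretrained weights and `include_top=True`, only `None` and `'softmax'` are accepted for `classifier_activation`; other activations remain available when training from scratch.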
PiperOrigin-RevId: 296235461 Change-Id: Iea01b8a2b260ece5e91b4dc1e6e42526136a2066 --- .../python/keras/applications/densenet.py | 29 ++++-- .../python/keras/applications/efficientnet.py | 45 +++++---- .../keras/applications/imagenet_utils.py | 25 +++++ .../keras/applications/inception_resnet_v2.py | 12 ++- .../python/keras/applications/inception_v3.py | 26 +++-- .../python/keras/applications/mobilenet.py | 13 ++- .../python/keras/applications/mobilenet_v2.py | 14 ++- .../python/keras/applications/nasnet.py | 40 +++++--- .../python/keras/applications/resnet.py | 13 ++- .../python/keras/applications/resnet_v2.py | 98 ++++++++++++++----- tensorflow/python/keras/applications/vgg16.py | 31 ++++-- tensorflow/python/keras/applications/vgg19.py | 26 +++-- .../python/keras/applications/xception.py | 26 +++-- 13 files changed, 285 insertions(+), 113 deletions(-) diff --git a/tensorflow/python/keras/applications/densenet.py b/tensorflow/python/keras/applications/densenet.py index 237202ff429..9a7be9a3b7a 100644 --- a/tensorflow/python/keras/applications/densenet.py +++ b/tensorflow/python/keras/applications/densenet.py @@ -125,13 +125,16 @@ def conv_block(x, growth_rate, name): return x -def DenseNet(blocks, - include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000): +def DenseNet( + blocks, + include_top=True, + weights='imagenet', + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation='softmax', +): """Instantiates the DenseNet architecture. Optionally loads weights pre-trained on ImageNet. @@ -169,13 +172,18 @@ def DenseNet(blocks, classes: optional number of classes to classify images into, only to be specified if `include_top` is True, and if no `weights` argument is specified. + classifier_activation: A `str` or callable. The activation function to use + on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. Returns: - A Keras model instance. + A `keras.Model` instance. Raises: ValueError: in case of invalid argument for `weights`, or invalid input shape. + ValueError: if `classifier_activation` is not `softmax` or `None` when + using a pretrained top layer. 
""" if not (weights in {'imagenet', None} or os.path.exists(weights)): raise ValueError('The `weights` argument should be either ' @@ -228,7 +236,10 @@ def DenseNet(blocks, if include_top: x = layers.GlobalAveragePooling2D(name='avg_pool')(x) - x = layers.Dense(classes, activation='softmax', name='fc1000')(x) + + imagenet_utils.validate_activation(classifier_activation, weights) + x = layers.Dense(classes, activation=classifier_activation, + name='predictions')(x) else: if pooling == 'avg': x = layers.GlobalAveragePooling2D(name='avg_pool')(x) diff --git a/tensorflow/python/keras/applications/efficientnet.py b/tensorflow/python/keras/applications/efficientnet.py index f3d0f1e5b0e..11ba3a98b7e 100644 --- a/tensorflow/python/keras/applications/efficientnet.py +++ b/tensorflow/python/keras/applications/efficientnet.py @@ -141,21 +141,24 @@ DENSE_KERNEL_INITIALIZER = { } -def EfficientNet(width_coefficient, - depth_coefficient, - default_size, - dropout_rate=0.2, - drop_connect_rate=0.2, - depth_divisor=8, - activation='swish', - blocks_args='default', - model_name='efficientnet', - include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000): +def EfficientNet( + width_coefficient, + depth_coefficient, + default_size, + dropout_rate=0.2, + drop_connect_rate=0.2, + depth_divisor=8, + activation='swish', + blocks_args='default', + model_name='efficientnet', + include_top=True, + weights='imagenet', + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation='softmax', +): """Instantiates the EfficientNet architecture using given scaling coefficients. Optionally loads weights pre-trained on ImageNet. @@ -197,13 +200,18 @@ def EfficientNet(width_coefficient, classes: optional number of classes to classify images into, only to be specified if `include_top` is True, and if no `weights` argument is specified. + classifier_activation: A `str` or callable. The activation function to use + on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. Returns: - A Keras model instance. + A `keras.Model` instance. Raises: ValueError: in case of invalid argument for `weights`, or invalid input shape. + ValueError: if `classifier_activation` is not `softmax` or `None` when + using a pretrained top layer. 
""" if blocks_args == 'default': blocks_args = DEFAULT_BLOCKS_ARGS @@ -307,11 +315,12 @@ def EfficientNet(width_coefficient, x = layers.GlobalAveragePooling2D(name='avg_pool')(x) if dropout_rate > 0: x = layers.Dropout(dropout_rate, name='top_dropout')(x) + imagenet_utils.validate_activation(classifier_activation, weights) x = layers.Dense( classes, - activation='softmax', + activation=classifier_activation, kernel_initializer=DENSE_KERNEL_INITIALIZER, - name='probs')(x) + name='predictions')(x) else: if pooling == 'avg': x = layers.GlobalAveragePooling2D(name='avg_pool')(x) diff --git a/tensorflow/python/keras/applications/imagenet_utils.py b/tensorflow/python/keras/applications/imagenet_utils.py index 206be8406ee..55299ebfa50 100644 --- a/tensorflow/python/keras/applications/imagenet_utils.py +++ b/tensorflow/python/keras/applications/imagenet_utils.py @@ -22,6 +22,7 @@ import warnings import numpy as np +from tensorflow.python.keras import activations from tensorflow.python.keras import backend from tensorflow.python.keras.utils import data_utils from tensorflow.python.util.tf_export import keras_export @@ -355,3 +356,27 @@ def correct_pad(inputs, kernel_size): correct = (kernel_size[0] // 2, kernel_size[1] // 2) return ((correct[0] - adjust[0], correct[0]), (correct[1] - adjust[1], correct[1])) + + +def validate_activation(classifier_activation, weights): + """validates that the classifer_activation is compatible with the weights. + + Args: + classifier_activation: str or callable activation function + weights: The pretrained weights to load. + + Raises: + ValueError: if an activation other than `None` or `softmax` are used with + pretrained weights. + """ + if weights is None: + return + + classifier_activation = activations.get(classifier_activation) + if classifier_activation not in [ + activations.get('softmax'), + activations.get(None) + ]: + raise ValueError('Only `None` and `softmax` activations are allowed ' + 'for the `classifier_activation` argument when using ' + 'pretrained weights, with `include_top=True`') diff --git a/tensorflow/python/keras/applications/inception_resnet_v2.py b/tensorflow/python/keras/applications/inception_resnet_v2.py index 092343144c7..ab8ab71e3b0 100644 --- a/tensorflow/python/keras/applications/inception_resnet_v2.py +++ b/tensorflow/python/keras/applications/inception_resnet_v2.py @@ -48,6 +48,7 @@ def InceptionResNetV2(include_top=True, input_shape=None, pooling=None, classes=1000, + classifier_activation='softmax', **kwargs): """Instantiates the Inception-ResNet v2 architecture. @@ -82,14 +83,19 @@ def InceptionResNetV2(include_top=True, classes: optional number of classes to classify images into, only to be specified if `include_top` is `True`, and if no `weights` argument is specified. + classifier_activation: A `str` or callable. The activation function to use + on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. **kwargs: For backwards compatibility only. Returns: - A Keras `Model` instance. + A `keras.Model` instance. Raises: ValueError: in case of invalid argument for `weights`, or invalid input shape. + ValueError: if `classifier_activation` is not `softmax` or `None` when + using a pretrained top layer. 
""" if 'layers' in kwargs: global layers @@ -189,7 +195,9 @@ def InceptionResNetV2(include_top=True, if include_top: # Classification block x = layers.GlobalAveragePooling2D(name='avg_pool')(x) - x = layers.Dense(classes, activation='softmax', name='predictions')(x) + imagenet_utils.validate_activation(classifier_activation, weights) + x = layers.Dense(classes, activation=classifier_activation, + name='predictions')(x) else: if pooling == 'avg': x = layers.GlobalAveragePooling2D()(x) diff --git a/tensorflow/python/keras/applications/inception_v3.py b/tensorflow/python/keras/applications/inception_v3.py index ecec195dff6..f8a56e62234 100644 --- a/tensorflow/python/keras/applications/inception_v3.py +++ b/tensorflow/python/keras/applications/inception_v3.py @@ -44,12 +44,15 @@ WEIGHTS_PATH_NO_TOP = ( @keras_export('keras.applications.inception_v3.InceptionV3', 'keras.applications.InceptionV3') -def InceptionV3(include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000): +def InceptionV3( + include_top=True, + weights='imagenet', + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation='softmax', +): """Instantiates the Inception v3 architecture. Reference paper: @@ -89,13 +92,18 @@ def InceptionV3(include_top=True, classes: optional number of classes to classify images into, only to be specified if `include_top` is True, and if no `weights` argument is specified. Default to 1000. + classifier_activation: A `str` or callable. The activation function to use + on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. Returns: - A Keras `tf.keras.Model` instance. + A `keras.Model` instance. Raises: ValueError: in case of invalid argument for `weights`, or invalid input shape. + ValueError: if `classifier_activation` is not `softmax` or `None` when + using a pretrained top layer. """ if not (weights in {'imagenet', None} or os.path.exists(weights)): raise ValueError('The `weights` argument should be either ' @@ -309,7 +317,9 @@ def InceptionV3(include_top=True, if include_top: # Classification block x = layers.GlobalAveragePooling2D(name='avg_pool')(x) - x = layers.Dense(classes, activation='softmax', name='predictions')(x) + imagenet_utils.validate_activation(classifier_activation, weights) + x = layers.Dense(classes, activation=classifier_activation, + name='predictions')(x) else: if pooling == 'avg': x = layers.GlobalAveragePooling2D()(x) diff --git a/tensorflow/python/keras/applications/mobilenet.py b/tensorflow/python/keras/applications/mobilenet.py index e64efa53815..224e8c84496 100644 --- a/tensorflow/python/keras/applications/mobilenet.py +++ b/tensorflow/python/keras/applications/mobilenet.py @@ -90,6 +90,7 @@ def MobileNet(input_shape=None, input_tensor=None, pooling=None, classes=1000, + classifier_activation='softmax', **kwargs): """Instantiates the MobileNet architecture. @@ -138,14 +139,18 @@ def MobileNet(input_shape=None, classes: Optional number of classes to classify images into, only to be specified if `include_top` is True, and if no `weights` argument is specified. Defaults to 1000. + classifier_activation: A `str` or callable. The activation function to use + on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. **kwargs: For backwards compatibility only. - Returns: - A `tf.keras.Model` instance. + A `keras.Model` instance. 
Raises: ValueError: in case of invalid argument for `weights`, or invalid input shape. + ValueError: if `classifier_activation` is not `softmax` or `None` when + using a pretrained top layer. """ if 'layers' in kwargs: global layers @@ -252,7 +257,9 @@ def MobileNet(input_shape=None, x = layers.Dropout(dropout, name='dropout')(x) x = layers.Conv2D(classes, (1, 1), padding='same', name='conv_preds')(x) x = layers.Reshape((classes,), name='reshape_2')(x) - x = layers.Activation('softmax', name='act_softmax')(x) + imagenet_utils.validate_activation(classifier_activation, weights) + x = layers.Activation(activation=classifier_activation, + name='predictions')(x) else: if pooling == 'avg': x = layers.GlobalAveragePooling2D()(x) diff --git a/tensorflow/python/keras/applications/mobilenet_v2.py b/tensorflow/python/keras/applications/mobilenet_v2.py index 186b6e3db61..a983f6d7e46 100644 --- a/tensorflow/python/keras/applications/mobilenet_v2.py +++ b/tensorflow/python/keras/applications/mobilenet_v2.py @@ -85,7 +85,6 @@ from tensorflow.python.keras.utils import layer_utils from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util.tf_export import keras_export - BASE_WEIGHT_PATH = ('https://storage.googleapis.com/tensorflow/' 'keras-applications/mobilenet_v2/') @@ -99,6 +98,7 @@ def MobileNetV2(input_shape=None, input_tensor=None, pooling=None, classes=1000, + classifier_activation='softmax', **kwargs): """Instantiates the MobileNetV2 architecture. @@ -152,6 +152,9 @@ def MobileNetV2(input_shape=None, classes: Integer, optional number of classes to classify images into, only to be specified if `include_top` is True, and if no `weights` argument is specified. + classifier_activation: A `str` or callable. The activation function to use + on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. **kwargs: For backwards compatibility only. Returns: @@ -161,6 +164,8 @@ def MobileNetV2(input_shape=None, ValueError: in case of invalid argument for `weights`, or invalid input shape or invalid alpha, rows when weights='imagenet' + ValueError: if `classifier_activation` is not `softmax` or `None` when + using a pretrained top layer. 
""" if 'layers' in kwargs: global layers @@ -360,9 +365,10 @@ def MobileNetV2(input_shape=None, if include_top: x = layers.GlobalAveragePooling2D()(x) - x = layers.Dense( - classes, activation='softmax', use_bias=True, name='Logits')( - x) + imagenet_utils.validate_activation(classifier_activation, weights) + x = layers.Dense(classes, activation=classifier_activation, + name='predictions')(x) + else: if pooling == 'avg': x = layers.GlobalAveragePooling2D()(x) diff --git a/tensorflow/python/keras/applications/nasnet.py b/tensorflow/python/keras/applications/nasnet.py index 0a693b83652..a29d5f4c380 100644 --- a/tensorflow/python/keras/applications/nasnet.py +++ b/tensorflow/python/keras/applications/nasnet.py @@ -61,18 +61,21 @@ NASNET_LARGE_WEIGHT_PATH = BASE_WEIGHTS_PATH + 'NASNet-large.h5' NASNET_LARGE_WEIGHT_PATH_NO_TOP = BASE_WEIGHTS_PATH + 'NASNet-large-no-top.h5' -def NASNet(input_shape=None, - penultimate_filters=4032, - num_blocks=6, - stem_block_filters=96, - skip_reduction=True, - filter_multiplier=2, - include_top=True, - weights=None, - input_tensor=None, - pooling=None, - classes=1000, - default_size=None): +def NASNet( + input_shape=None, + penultimate_filters=4032, + num_blocks=6, + stem_block_filters=96, + skip_reduction=True, + filter_multiplier=2, + include_top=True, + weights=None, + input_tensor=None, + pooling=None, + classes=1000, + default_size=None, + classifier_activation='softmax', +): """Instantiates a NASNet model. Optionally loads weights pre-trained on ImageNet. @@ -127,13 +130,18 @@ def NASNet(input_shape=None, into, only to be specified if `include_top` is True, and if no `weights` argument is specified. default_size: Specifies the default image size of the model + classifier_activation: A `str` or callable. The activation function to use + on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. Returns: - A Keras model instance. + A `keras.Model` instance. Raises: ValueError: In case of invalid argument for `weights`, - invalid input shape or invalid `penultimate_filters` value. + invalid input shape or invalid `penultimate_filters` value. + ValueError: if `classifier_activation` is not `softmax` or `None` when + using a pretrained top layer. """ if not (weights in {'imagenet', None} or os.path.exists(weights)): raise ValueError('The `weights` argument should be either ' @@ -247,7 +255,9 @@ def NASNet(input_shape=None, if include_top: x = layers.GlobalAveragePooling2D()(x) - x = layers.Dense(classes, activation='softmax', name='predictions')(x) + imagenet_utils.validate_activation(classifier_activation, weights) + x = layers.Dense(classes, activation=classifier_activation, + name='predictions')(x) else: if pooling == 'avg': x = layers.GlobalAveragePooling2D()(x) diff --git a/tensorflow/python/keras/applications/resnet.py b/tensorflow/python/keras/applications/resnet.py index d30b3cca55e..86d26695373 100644 --- a/tensorflow/python/keras/applications/resnet.py +++ b/tensorflow/python/keras/applications/resnet.py @@ -61,6 +61,7 @@ def ResNet(stack_fn, input_shape=None, pooling=None, classes=1000, + classifier_activation='softmax', **kwargs): """Instantiates the ResNet, ResNetV2, and ResNeXt architecture. @@ -103,14 +104,18 @@ def ResNet(stack_fn, classes: optional number of classes to classify images into, only to be specified if `include_top` is True, and if no `weights` argument is specified. + classifier_activation: A `str` or callable. 
The activation function to use + on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. **kwargs: For backwards compatibility only. - Returns: - A Keras model instance. + A `keras.Model` instance. Raises: ValueError: in case of invalid argument for `weights`, or invalid input shape. + ValueError: if `classifier_activation` is not `softmax` or `None` when + using a pretrained top layer. """ if 'layers' in kwargs: global layers @@ -167,7 +172,9 @@ def ResNet(stack_fn, if include_top: x = layers.GlobalAveragePooling2D(name='avg_pool')(x) - x = layers.Dense(classes, activation='softmax', name='probs')(x) + imagenet_utils.validate_activation(classifier_activation, weights) + x = layers.Dense(classes, activation=classifier_activation, + name='predictions')(x) else: if pooling == 'avg': x = layers.GlobalAveragePooling2D(name='avg_pool')(x) diff --git a/tensorflow/python/keras/applications/resnet_v2.py b/tensorflow/python/keras/applications/resnet_v2.py index ce56fbb19cb..2e31017dfa9 100644 --- a/tensorflow/python/keras/applications/resnet_v2.py +++ b/tensorflow/python/keras/applications/resnet_v2.py @@ -25,56 +25,101 @@ from tensorflow.python.util.tf_export import keras_export @keras_export('keras.applications.resnet_v2.ResNet50V2', 'keras.applications.ResNet50V2') -def ResNet50V2(include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000): +def ResNet50V2( + include_top=True, + weights='imagenet', + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation='softmax', +): """Instantiates the ResNet50V2 architecture.""" def stack_fn(x): x = resnet.stack2(x, 64, 3, name='conv2') x = resnet.stack2(x, 128, 4, name='conv3') x = resnet.stack2(x, 256, 6, name='conv4') return resnet.stack2(x, 512, 3, stride1=1, name='conv5') - return resnet.ResNet(stack_fn, True, True, 'resnet50v2', include_top, weights, - input_tensor, input_shape, pooling, classes) + + return resnet.ResNet( + stack_fn, + True, + True, + 'resnet50v2', + include_top, + weights, + input_tensor, + input_shape, + pooling, + classes, + classifier_activation=classifier_activation, + ) @keras_export('keras.applications.resnet_v2.ResNet101V2', 'keras.applications.ResNet101V2') -def ResNet101V2(include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000): +def ResNet101V2( + include_top=True, + weights='imagenet', + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation='softmax', +): """Instantiates the ResNet101V2 architecture.""" def stack_fn(x): x = resnet.stack2(x, 64, 3, name='conv2') x = resnet.stack2(x, 128, 4, name='conv3') x = resnet.stack2(x, 256, 23, name='conv4') return resnet.stack2(x, 512, 3, stride1=1, name='conv5') - return resnet.ResNet(stack_fn, True, True, 'resnet101v2', include_top, - weights, input_tensor, input_shape, pooling, classes) + + return resnet.ResNet( + stack_fn, + True, + True, + 'resnet101v2', + include_top, + weights, + input_tensor, + input_shape, + pooling, + classes, + classifier_activation=classifier_activation, + ) @keras_export('keras.applications.resnet_v2.ResNet152V2', 'keras.applications.ResNet152V2') -def ResNet152V2(include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000): +def ResNet152V2( + include_top=True, + weights='imagenet', + input_tensor=None, + input_shape=None, + pooling=None, + 
classes=1000, + classifier_activation='softmax', +): """Instantiates the ResNet152V2 architecture.""" def stack_fn(x): x = resnet.stack2(x, 64, 3, name='conv2') x = resnet.stack2(x, 128, 8, name='conv3') x = resnet.stack2(x, 256, 36, name='conv4') return resnet.stack2(x, 512, 3, stride1=1, name='conv5') - return resnet.ResNet(stack_fn, True, True, 'resnet152v2', include_top, - weights, input_tensor, input_shape, pooling, classes) + + return resnet.ResNet( + stack_fn, + True, + True, + 'resnet152v2', + include_top, + weights, + input_tensor, + input_shape, + pooling, + classes, + classifier_activation=classifier_activation, + ) @keras_export('keras.applications.resnet_v2.preprocess_input') @@ -123,9 +168,12 @@ DOC = """ classes: optional number of classes to classify images into, only to be specified if `include_top` is True, and if no `weights` argument is specified. + classifier_activation: A `str` or callable. The activation function to use + on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. Returns: - A Keras model instance. + A `keras.Model` instance. """ setattr(ResNet50V2, '__doc__', ResNet50V2.__doc__ + DOC) diff --git a/tensorflow/python/keras/applications/vgg16.py b/tensorflow/python/keras/applications/vgg16.py index 958ed955106..e268a592833 100644 --- a/tensorflow/python/keras/applications/vgg16.py +++ b/tensorflow/python/keras/applications/vgg16.py @@ -37,12 +37,15 @@ WEIGHTS_PATH_NO_TOP = ('https://storage.googleapis.com/tensorflow/' @keras_export('keras.applications.vgg16.VGG16', 'keras.applications.VGG16') -def VGG16(include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000): +def VGG16( + include_top=True, + weights='imagenet', + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation='softmax', +): """Instantiates the VGG16 model. By default, it loads weights pre-trained on ImageNet. Check 'weights' for @@ -85,13 +88,18 @@ def VGG16(include_top=True, classes: optional number of classes to classify images into, only to be specified if `include_top` is True, and if no `weights` argument is specified. + classifier_activation: A `str` or callable. The activation function to use + on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. Returns: - A Keras model instance. + A `keras.Model` instance. Raises: - ValueError: in case of invalid argument for `weights`, - or invalid input shape. + ValueError: in case of invalid argument for `weights`, + or invalid input shape. + ValueError: if `classifier_activation` is not `softmax` or `None` when + using a pretrained top layer. 
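Each application model above wires the new argument the same way: the classification head is (or becomes) a Dense layer named 'predictions', `imagenet_utils.validate_activation` rejects anything other than `'softmax'` or `None` when pretrained weights are requested, and `None` leaves the layer emitting raw logits. A minimal usage sketch; the model, class count and input shape below are illustrative and not taken from the diff:

    import tensorflow as tf

    # Random init, so any classifier activation is accepted.
    logits_model = tf.keras.applications.ResNet50V2(
        weights=None,
        include_top=True,
        input_shape=(224, 224, 3),
        classes=10,                       # illustrative class count
        classifier_activation=None)       # 'predictions' layer returns logits

    # With weights='imagenet', validate_activation only allows 'softmax' or None.
    probs_model = tf.keras.applications.ResNet50V2(
        weights='imagenet', classifier_activation='softmax')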
""" if not (weights in {'imagenet', None} or os.path.exists(weights)): raise ValueError('The `weights` argument should be either ' @@ -165,7 +173,10 @@ def VGG16(include_top=True, x = layers.Flatten(name='flatten')(x) x = layers.Dense(4096, activation='relu', name='fc1')(x) x = layers.Dense(4096, activation='relu', name='fc2')(x) - x = layers.Dense(classes, activation='softmax', name='predictions')(x) + + imagenet_utils.validate_activation(classifier_activation, weights) + x = layers.Dense(classes, activation=classifier_activation, + name='predictions')(x) else: if pooling == 'avg': x = layers.GlobalAveragePooling2D()(x) diff --git a/tensorflow/python/keras/applications/vgg19.py b/tensorflow/python/keras/applications/vgg19.py index 808580ada07..8d25dc0e42f 100644 --- a/tensorflow/python/keras/applications/vgg19.py +++ b/tensorflow/python/keras/applications/vgg19.py @@ -42,12 +42,15 @@ WEIGHTS_PATH_NO_TOP = ('https://storage.googleapis.com/tensorflow/' @keras_export('keras.applications.vgg19.VGG19', 'keras.applications.VGG19') -def VGG19(include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000): +def VGG19( + include_top=True, + weights='imagenet', + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation='softmax', +): """Instantiates the VGG19 architecture. By default, it loads weights pre-trained on ImageNet. Check 'weights' for @@ -90,13 +93,18 @@ def VGG19(include_top=True, classes: optional number of classes to classify images into, only to be specified if `include_top` is True, and if no `weights` argument is specified. + classifier_activation: A `str` or callable. The activation function to use + on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. Returns: - A Keras model instance. + A `keras.Model` instance. Raises: ValueError: in case of invalid argument for `weights`, or invalid input shape. + ValueError: if `classifier_activation` is not `softmax` or `None` when + using a pretrained top layer. """ if not (weights in {'imagenet', None} or os.path.exists(weights)): raise ValueError('The `weights` argument should be either ' @@ -176,7 +184,9 @@ def VGG19(include_top=True, x = layers.Flatten(name='flatten')(x) x = layers.Dense(4096, activation='relu', name='fc1')(x) x = layers.Dense(4096, activation='relu', name='fc2')(x) - x = layers.Dense(classes, activation='softmax', name='predictions')(x) + imagenet_utils.validate_activation(classifier_activation, weights) + x = layers.Dense(classes, activation=classifier_activation, + name='predictions')(x) else: if pooling == 'avg': x = layers.GlobalAveragePooling2D()(x) diff --git a/tensorflow/python/keras/applications/xception.py b/tensorflow/python/keras/applications/xception.py index 47f386cc721..7f6602b90d1 100644 --- a/tensorflow/python/keras/applications/xception.py +++ b/tensorflow/python/keras/applications/xception.py @@ -48,12 +48,15 @@ TF_WEIGHTS_PATH_NO_TOP = ( @keras_export('keras.applications.xception.Xception', 'keras.applications.Xception') -def Xception(include_top=True, - weights='imagenet', - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000): +def Xception( + include_top=True, + weights='imagenet', + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation='softmax', +): """Instantiates the Xception architecture. Optionally loads weights pre-trained on ImageNet. 
@@ -90,13 +93,18 @@ def Xception(include_top=True, classes: optional number of classes to classify images into, only to be specified if `include_top` is True, and if no `weights` argument is specified. + classifier_activation: A `str` or callable. The activation function to use + on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. Returns: - A Keras model instance. + A `keras.Model` instance. Raises: ValueError: in case of invalid argument for `weights`, or invalid input shape. + ValueError: if `classifier_activation` is not `softmax` or `None` when + using a pretrained top layer. """ if not (weights in {'imagenet', None} or os.path.exists(weights)): raise ValueError('The `weights` argument should be either ' @@ -260,7 +268,9 @@ def Xception(include_top=True, if include_top: x = layers.GlobalAveragePooling2D(name='avg_pool')(x) - x = layers.Dense(classes, activation='softmax', name='predictions')(x) + imagenet_utils.validate_activation(classifier_activation, weights) + x = layers.Dense(classes, activation=classifier_activation, + name='predictions')(x) else: if pooling == 'avg': x = layers.GlobalAveragePooling2D()(x) From 5cedee2c760f3462e50943a83d64ce24b27b16fc Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Thu, 20 Feb 2020 10:12:46 -0800 Subject: [PATCH 354/442] Fix doc generator to handle new package layout. tensorflow_core is gone in tf-nightly. PiperOrigin-RevId: 296238681 Change-Id: I604be239c807b6e6fb9569560d9f94326b303711 --- tensorflow/tools/docs/BUILD | 9 +++++ tensorflow/tools/docs/base_dir.py | 52 +++++++++++++++++++++++++++ tensorflow/tools/docs/generate2.py | 57 ++++++++++++++++++++---------- 3 files changed, 99 insertions(+), 19 deletions(-) create mode 100644 tensorflow/tools/docs/base_dir.py diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD index e49c4d29311..d8a45098b78 100644 --- a/tensorflow/tools/docs/BUILD +++ b/tensorflow/tools/docs/BUILD @@ -165,11 +165,20 @@ py_binary( ], ) +py_library( + # Opensource only + name = "base_dir_oss", + srcs = ["base_dir.py"], + srcs_version = "PY3", + deps = [], +) + py_library( name = "generate2_lib", srcs = ["generate2.py"], srcs_version = "PY3", deps = [ + ":base_dir_oss", "//tensorflow:tensorflow_py", "//tensorflow/python:util", ], diff --git a/tensorflow/tools/docs/base_dir.py b/tensorflow/tools/docs/base_dir.py new file mode 100644 index 00000000000..b97925d10ae --- /dev/null +++ b/tensorflow/tools/docs/base_dir.py @@ -0,0 +1,52 @@ +# Lint as: python3 +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Opensource base_dir configuration for tensorflow doc-generator.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import distutils +from os import path + +import tensorboard +import tensorflow as tf +import tensorflow_estimator + + +def get_base_dirs_and_prefixes(code_url_prefix): + """Returns the base_dirs and code_prefixes for OSS TensorFlow api gen.""" + base_dir = path.dirname(tf.__file__) + + if distutils.version.LooseVersion(tf.__version__) >= "2.2": + base_dirs = [ + base_dir, + path.dirname(tensorboard.__file__), + path.dirname(tensorflow_estimator.__file__), + ] + else: + base_dirs = [ + path.normpath(path.join(base_dir, "../tensorflow_core")), + path.dirname(tensorboard.__file__), + path.dirname(tensorflow_estimator.__file__), + ] + + code_url_prefixes = ( + code_url_prefix, + "https://github.com/tensorflow/tensorboard/tree/master/tensorboard", + "https://github.com/tensorflow/estimator/tree/master/tensorflow_estimator", + ) + + return base_dirs, code_url_prefixes diff --git a/tensorflow/tools/docs/generate2.py b/tensorflow/tools/docs/generate2.py index ff0dd68b326..cb1bfe39c6c 100644 --- a/tensorflow/tools/docs/generate2.py +++ b/tensorflow/tools/docs/generate2.py @@ -30,7 +30,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from os import path +import pathlib import textwrap from absl import app @@ -42,12 +42,13 @@ from tensorflow_docs.api_generator import doc_controls from tensorflow_docs.api_generator import doc_generator_visitor from tensorflow_docs.api_generator import generate_lib -import tensorboard -import tensorflow_estimator from tensorflow.python.framework import ops from tensorflow.python.util import tf_export from tensorflow.python.util import tf_inspect +# Caution: the google and oss versions of this import are different. +import base_dir + # `tf` has an `__all__` that doesn't list important things like `keras`. # The doc generator recognizes `__all__` as the list of public symbols. # So patch `tf.__all__` to list everything. 
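A rough sketch of how the new helper is consumed; the GitHub prefix below is only an illustrative stand-in for the `code_url_prefix` value that `build_docs` already receives:

    import base_dir  # the opensource module added above

    base_dirs, code_url_prefixes = base_dir.get_base_dirs_and_prefixes(
        "https://github.com/tensorflow/tensorflow/blob/master/tensorflow")
    # For tf.__version__ >= 2.2 the first base dir is the installed `tensorflow`
    # package itself; older wheels fall back to the sibling `tensorflow_core`
    # directory. tensorboard and tensorflow_estimator are appended either way.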
@@ -202,22 +203,8 @@ def build_docs(output_dir, code_url_prefix, search_hints=True): except AttributeError: pass - base_dir = path.normpath(path.join(tf.__file__, "../..")) - - base_dirs = ( - path.join(base_dir, "tensorflow_core"), - # External packages base directories - path.dirname(tensorboard.__file__), - path.dirname(tensorflow_estimator.__file__), - ) - - code_url_prefixes = ( - code_url_prefix, - # External packages source repositories, - "https://github.com/tensorflow/tensorboard/tree/master/tensorboard", - "https://github.com/tensorflow/estimator/tree/master/tensorflow_estimator", - ) - + base_dirs, code_url_prefixes = base_dir.get_base_dirs_and_prefixes( + code_url_prefix) doc_generator = generate_lib.DocGenerator( root_title="TensorFlow 2", py_modules=[("tf", tf)], @@ -230,6 +217,38 @@ def build_docs(output_dir, code_url_prefix, search_hints=True): doc_generator.build(output_dir) + out_path = pathlib.Path(output_dir) + num_files = len(list(out_path.rglob("*"))) + if num_files < 2500: + raise ValueError("The TensorFlow api should be more than 2500 files" + "(found {}).".format(num_files)) + expected_path_contents = { + "tf/summary/audio.md": + "tensorboard/plugins/audio/summary_v2.py", + "tf/estimator/DNNClassifier.md": + "tensorflow_estimator/python/estimator/canned/dnn.py", + "tf/nn/sigmoid_cross_entropy_with_logits.md": + "python/ops/nn_impl.py", + "tf/keras/Model.md": + "tensorflow/python/keras/engine/training.py", + "tf/compat/v1/gradients.md": + "tensorflow/python/ops/gradients_impl.py", + } + + all_passed = True + error_msg_parts = [ + 'Some "view source" links seem to be broken, please check:' + ] + + for (rel_path, contents) in expected_path_contents.items(): + path = out_path / rel_path + if contents not in path.read_text(): + all_passed = False + error_msg_parts.append(" " + str(path)) + + if not all_passed: + raise ValueError("\n".join(error_msg_parts)) + def main(argv): del argv From 49333f5489488f7a7a8bb24987b89b7c9efe9e8d Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Thu, 20 Feb 2020 10:17:33 -0800 Subject: [PATCH 355/442] Use Env::LocalTempFilename for a temp filename. This function works both in and outside of tests. Additionally, LocalTempFilename works well on Windows where as TmpDir is a little problematic because of bazel oddities. PiperOrigin-RevId: 296239768 Change-Id: Ie1c44de9f4a0b31100ec66979152c39a5e2a965f --- .../kernels/data/text_line_dataset_op_test.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/kernels/data/text_line_dataset_op_test.cc b/tensorflow/core/kernels/data/text_line_dataset_op_test.cc index f4c9589856d..e3f6e739ea8 100644 --- a/tensorflow/core/kernels/data/text_line_dataset_op_test.cc +++ b/tensorflow/core/kernels/data/text_line_dataset_op_test.cc @@ -19,6 +19,12 @@ namespace { constexpr char kNodeName[] = "text_line_dataset"; +tstring LocalTempFilename() { + std::string path; + CHECK(Env::Default()->LocalTempFilename(&path)); + return tstring(path); +} + class TextLineDatasetParams : public DatasetParams { public: TextLineDatasetParams(std::vector filenames, @@ -84,9 +90,7 @@ Status CreateTestFiles(const std::vector& filenames, // Test case 1: multiple text files with ZLIB compression. 
TextLineDatasetParams TextLineDatasetParams1() { - std::vector filenames = { - absl::StrCat(testing::TmpDir(), "/text_line_ZLIB_1"), - absl::StrCat(testing::TmpDir(), "/text_line_ZLIB_2")}; + std::vector filenames = {LocalTempFilename(), LocalTempFilename()}; std::vector contents = { absl::StrCat("hello world\n", "11223334455\n"), absl::StrCat("abcd, EFgH\n", " \n", "$%^&*()\n")}; @@ -103,9 +107,7 @@ TextLineDatasetParams TextLineDatasetParams1() { // Test case 2: multiple text files with GZIP compression. TextLineDatasetParams TextLineDatasetParams2() { - std::vector filenames = { - absl::StrCat(testing::TmpDir(), "/text_line_GZIP_1"), - absl::StrCat(testing::TmpDir(), "/text_line_GZIP_2")}; + std::vector filenames = {LocalTempFilename(), LocalTempFilename()}; std::vector contents = { absl::StrCat("hello world\n", "11223334455\n"), absl::StrCat("abcd, EFgH\n", " \n", "$%^&*()\n")}; @@ -122,9 +124,7 @@ TextLineDatasetParams TextLineDatasetParams2() { // Test case 3: multiple text files without compression. TextLineDatasetParams TextLineDatasetParams3() { - std::vector filenames = { - absl::StrCat(testing::TmpDir(), "/text_line_UNCOMPRESSED_1"), - absl::StrCat(testing::TmpDir(), "/text_line_UNCOMPRESSED_2")}; + std::vector filenames = {LocalTempFilename(), LocalTempFilename()}; std::vector contents = { absl::StrCat("hello world\n", "11223334455\n"), absl::StrCat("abcd, EFgH\n", " \n", "$%^&*()\n")}; From 0fa7a0b0339c3fd7264f1259a4a60be43bb6c5dc Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Thu, 20 Feb 2020 10:17:48 -0800 Subject: [PATCH 356/442] Use Env::LocalTempFilename for a temp filename. This function works both in and outside of tests. Additionally, LocalTempFilename works well on Windows where as TmpDir is a little problematic because of bazel oddities. PiperOrigin-RevId: 296239824 Change-Id: I4e636bf150fc5554503e14361a5953598c9db638 --- .../fixed_length_record_dataset_op_test.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/kernels/data/fixed_length_record_dataset_op_test.cc b/tensorflow/core/kernels/data/fixed_length_record_dataset_op_test.cc index 8ffe8f50f96..4eab5ed08f3 100644 --- a/tensorflow/core/kernels/data/fixed_length_record_dataset_op_test.cc +++ b/tensorflow/core/kernels/data/fixed_length_record_dataset_op_test.cc @@ -20,6 +20,12 @@ namespace { constexpr char kNodeName[] = "fixed_length_record_dataset"; constexpr int kOpVersion = 2; +tstring LocalTempFilename() { + std::string path; + CHECK(Env::Default()->LocalTempFilename(&path)); + return tstring(path); +} + class FixedLengthRecordDatasetParams : public DatasetParams { public: FixedLengthRecordDatasetParams(const std::vector& filenames, @@ -105,9 +111,7 @@ Status CreateTestFiles(const std::vector& filenames, // Test case 1: multiple fixed-length record files with ZLIB compression. FixedLengthRecordDatasetParams FixedLengthRecordDatasetParams1() { - std::vector filenames = { - absl::StrCat(testing::TmpDir(), "/text_line_ZLIB_1"), - absl::StrCat(testing::TmpDir(), "/text_line_ZLIB_2")}; + std::vector filenames = {LocalTempFilename(), LocalTempFilename()}; std::vector contents = { absl::StrCat("HHHHH", "111", "222", "333", "FF"), absl::StrCat("HHHHH", "aaa", "bbb", "FF")}; @@ -128,9 +132,7 @@ FixedLengthRecordDatasetParams FixedLengthRecordDatasetParams1() { // Test case 2: multiple fixed-length record files with GZIP compression. 
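// A minimal sketch, not part of either test diff, of the Env call the two new
// LocalTempFilename() helpers wrap; it assumes the usual platform headers
// (env.h for Env, logging.h for CHECK).
#include <string>
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"

std::string NewTempPath() {
  std::string path;
  // LocalTempFilename fills `path` with a unique temp file name and returns
  // true on success; it works in or out of a bazel test and on Windows,
  // whereas testing::TmpDir() leans on bazel's test environment.
  CHECK(tensorflow::Env::Default()->LocalTempFilename(&path));
  return path;
}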
FixedLengthRecordDatasetParams FixedLengthRecordDatasetParams2() { - std::vector filenames = { - absl::StrCat(testing::TmpDir(), "/text_line_GZIP_1"), - absl::StrCat(testing::TmpDir(), "/text_line_GZIP_2")}; + std::vector filenames = {LocalTempFilename(), LocalTempFilename()}; std::vector contents = { absl::StrCat("HHHHH", "111", "222", "333", "FF"), absl::StrCat("HHHHH", "aaa", "bbb", "FF")}; @@ -150,9 +152,7 @@ FixedLengthRecordDatasetParams FixedLengthRecordDatasetParams2() { // Test case 3: multiple fixed-length record files without compression. FixedLengthRecordDatasetParams FixedLengthRecordDatasetParams3() { - std::vector filenames = { - absl::StrCat(testing::TmpDir(), "/text_line_UNCOMPRESSED_1"), - absl::StrCat(testing::TmpDir(), "/text_line_UNCOMPRESSED_2")}; + std::vector filenames = {LocalTempFilename(), LocalTempFilename()}; std::vector contents = { absl::StrCat("HHHHH", "111", "222", "333", "FF"), absl::StrCat("HHHHH", "aaa", "bbb", "FF")}; From 3113c74febd688a6046c34dc388dea2ff26a4a5b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 20 Feb 2020 10:27:37 -0800 Subject: [PATCH 357/442] give sendop/recvop some temporary attribute for tracing. so we can annotate the memcpy device events better. PiperOrigin-RevId: 296242264 Change-Id: Ib515bc56faf37ede6610b62c8aaab3ab66ef6830 --- tensorflow/core/common_runtime/copy_tensor.cc | 3 +- .../core/common_runtime/memory_types.cc | 4 +++ tensorflow/core/graph/graph_partition.cc | 2 ++ tensorflow/core/kernels/BUILD | 1 + tensorflow/core/kernels/sendrecv_ops.cc | 34 +++++++++++++++++++ tensorflow/core/kernels/sendrecv_ops.h | 4 +++ 6 files changed, 47 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/common_runtime/copy_tensor.cc b/tensorflow/core/common_runtime/copy_tensor.cc index 2a071e44a5c..cc4921e5781 100644 --- a/tensorflow/core/common_runtime/copy_tensor.cc +++ b/tensorflow/core/common_runtime/copy_tensor.cc @@ -204,7 +204,8 @@ void CopyTensor::ViaDMA(StringPiece edge_name, DeviceContext* send_dev_context, const Tensor* input, Tensor* output, int dev_to_dev_stream_index, StatusCallback done, bool sync_dst_compute) { - profiler::ScopedAnnotation annotation(edge_name); + profiler::ScopedAnnotation annotation( + [&] { return absl::StrCat("#edge_name=", edge_name, "#"); }); VLOG(1) << "Copy " << edge_name; const DeviceType src_device_type( diff --git a/tensorflow/core/common_runtime/memory_types.cc b/tensorflow/core/common_runtime/memory_types.cc index 4088165fac4..b37e65a7ca5 100644 --- a/tensorflow/core/common_runtime/memory_types.cc +++ b/tensorflow/core/common_runtime/memory_types.cc @@ -129,6 +129,8 @@ static Node* Send(Graph* g, const string& tensor_name, .Attr("send_device_incarnation", 0) // Do not care. 
.Attr("recv_device", device_name) .Attr("_hostmem_sendrecv", true) + .Attr("_src", edge->src()->name()) + .Attr("_dst", edge->dst()->name()) .Finalize(g, &ret)); return ret; } @@ -144,6 +146,8 @@ static Node* Recv(Graph* g, const string& tensor_name, .Attr("send_device_incarnation", 0) .Attr("recv_device", device_name) .Attr("_hostmem_sendrecv", true) + .Attr("_src", edge->src()->name()) + .Attr("_dst", edge->dst()->name()) .Finalize(g, &ret)); return ret; } diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc index 65b341fbae0..bf57e263441 100644 --- a/tensorflow/core/graph/graph_partition.cc +++ b/tensorflow/core/graph/graph_partition.cc @@ -189,6 +189,8 @@ void SetSendRecvAttrs(const PartitionOptions& opts, const Edge* edge, opts.get_incarnation(edge->src()->assigned_device_name()))); builder->Attr("recv_device", edge->dst()->assigned_device_name()); builder->Attr("client_terminated", false); + builder->Attr("_src", edge->src()->name()); + builder->Attr("_dst", edge->dst()->name()); } NodeDef* AddSend(const PartitionOptions& opts, const GraphInfo& g_info, diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index f940866da5f..e42de02b979 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -5175,6 +5175,7 @@ cc_library( REQUIRED_DEPS = [ "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", ] tf_kernel_library( diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc index 7e0e3496645..12456037415 100644 --- a/tensorflow/core/kernels/sendrecv_ops.cc +++ b/tensorflow/core/kernels/sendrecv_ops.cc @@ -15,7 +15,9 @@ limitations under the License. #include "tensorflow/core/kernels/sendrecv_ops.h" +#include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_def_util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/lib/strings/strcat.h" @@ -107,6 +109,22 @@ void SendOp::Compute(OpKernelContext* ctx) { } } +string SendOp::TraceString(OpKernelContext* ctx, bool verbose) { + const auto& attr = def().attr(); + auto src_it = attr.find("_src"); + auto dst_it = attr.find("_dst"); + const string& src = src_it != attr.end() ? src_it->second.s() : ""; + const string& dst = dst_it != attr.end() ? dst_it->second.s() : ""; + if (!verbose) { + return strings::StrCat(name_view(), ":", type_string_view(), "#from=", src, + ",to=", dst, "#"); + } else { + string trace_args = GetTraceArgument(ctx); + return strings::StrCat(name_view(), ":", type_string_view(), "#from=", src, + ",to=", dst, ",", trace_args, "#"); + } +} + REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_CPU), SendOp); REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_DEFAULT), SendOp); @@ -139,6 +157,22 @@ RecvOp::RecvOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) { } } +string RecvOp::TraceString(OpKernelContext* ctx, bool verbose) { + const auto& attr = def().attr(); + auto src_it = attr.find("_src"); + auto dst_it = attr.find("_dst"); + const string& src = src_it != attr.end() ? src_it->second.s() : ""; + const string& dst = dst_it != attr.end() ? 
dst_it->second.s() : ""; + if (!verbose) { + return strings::StrCat(name_view(), ":", type_string_view(), "#from=", src, + ",to=", dst, "#"); + } else { + string trace_args = GetTraceArgument(ctx); + return strings::StrCat(name_view(), ":", type_string_view(), "#from=", src, + ",to=", dst, ",", trace_args, "#"); + } +} + namespace { Rendezvous::DoneCallback make_recv_callback(OpKernelContext* ctx, AsyncOpKernel::DoneCallback done) { diff --git a/tensorflow/core/kernels/sendrecv_ops.h b/tensorflow/core/kernels/sendrecv_ops.h index 223854de132..06c5663bc04 100644 --- a/tensorflow/core/kernels/sendrecv_ops.h +++ b/tensorflow/core/kernels/sendrecv_ops.h @@ -26,6 +26,8 @@ class SendOp : public OpKernel { explicit SendOp(OpKernelConstruction* ctx); void Compute(OpKernelContext* ctx) override; + string TraceString(OpKernelContext* ctx, bool verbose) override; + private: string key_prefix_; Rendezvous::ParsedKey parsed_key_; @@ -39,6 +41,8 @@ class RecvOp : public AsyncOpKernel { explicit RecvOp(OpKernelConstruction* ctx); void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override; + string TraceString(OpKernelContext* ctx, bool verbose) override; + private: string key_prefix_; Rendezvous::ParsedKey parsed_key_; From 82b1d068026f1fbcb57a02425148ae631f45e054 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Thu, 20 Feb 2020 10:32:04 -0800 Subject: [PATCH 358/442] Add comment flagging people towards Env::LocalTempFilename instead of testing::TmpDir PiperOrigin-RevId: 296243323 Change-Id: I544c24625d36c5b72c04964c0b9ca5ed3a44fa4c --- tensorflow/core/platform/test.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/core/platform/test.h b/tensorflow/core/platform/test.h index 5ef6777f583..a2cda11c608 100644 --- a/tensorflow/core/platform/test.h +++ b/tensorflow/core/platform/test.h @@ -34,6 +34,8 @@ namespace tensorflow { namespace testing { // Return a temporary directory suitable for temporary testing files. +// +// Where possible, consider using Env::LocalTempFilename over this function. string TmpDir(); // Returns the path to TensorFlow in the directory containing data From 8bb742049234d72c28ea22ed86f67f40b288aae8 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Thu, 20 Feb 2020 11:01:43 -0800 Subject: [PATCH 359/442] Use Env::LocalTempFilename for a temp filename. This function works both in and outside of tests. Additionally, LocalTempFilename works well on Windows where as TmpDir is a little problematic because of bazel oddities. PiperOrigin-RevId: 296250888 Change-Id: I2a8bc52ad784eda4d00f63c91eec681cc91e16e7 --- tensorflow/core/lib/io/inputbuffer_test.cc | 32 ++++++++++++++-------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/tensorflow/core/lib/io/inputbuffer_test.cc b/tensorflow/core/lib/io/inputbuffer_test.cc index 7ab6105029e..a8d75edc610 100644 --- a/tensorflow/core/lib/io/inputbuffer_test.cc +++ b/tensorflow/core/lib/io/inputbuffer_test.cc @@ -16,7 +16,6 @@ limitations under the License. #include "tensorflow/core/lib/io/inputbuffer.h" #include -#include "tensorflow/core/platform/env.h" #include "tensorflow/core/lib/core/coding.h" #include "tensorflow/core/lib/core/errors.h" @@ -24,6 +23,7 @@ limitations under the License. 
#include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/test.h" @@ -37,7 +37,8 @@ static std::vector BufferSizes() { TEST(InputBuffer, ReadLine_Empty) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/inputbuffer_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); TF_ASSERT_OK(WriteStringToFile(env, fname, "")); for (auto buf_size : BufferSizes()) { @@ -51,7 +52,8 @@ TEST(InputBuffer, ReadLine_Empty) { TEST(InputBuffer, ReadLine1) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/inputbuffer_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); TF_CHECK_OK( WriteStringToFile(env, fname, "line one\nline two\nline three\n")); @@ -74,7 +76,8 @@ TEST(InputBuffer, ReadLine1) { TEST(InputBuffer, ReadLine_NoTrailingNewLine) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/inputbuffer_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); TF_ASSERT_OK(WriteStringToFile(env, fname, "line one\nline two\nline three")); for (auto buf_size : BufferSizes()) { @@ -96,7 +99,8 @@ TEST(InputBuffer, ReadLine_NoTrailingNewLine) { TEST(InputBuffer, ReadLine_EmptyLines) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/inputbuffer_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); TF_CHECK_OK( WriteStringToFile(env, fname, "line one\n\n\nline two\nline three")); @@ -123,7 +127,8 @@ TEST(InputBuffer, ReadLine_EmptyLines) { TEST(InputBuffer, ReadLine_CRLF) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/inputbuffer_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); TF_ASSERT_OK(WriteStringToFile(env, fname, "line one\r\n\r\n\r\nline two\r\nline three")); @@ -150,7 +155,8 @@ TEST(InputBuffer, ReadLine_CRLF) { TEST(InputBuffer, ReadNBytes) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/inputbuffer_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); TF_ASSERT_OK(WriteStringToFile(env, fname, "0123456789")); // ReadNBytes(int64, string*). @@ -223,7 +229,8 @@ TEST(InputBuffer, ReadNBytes) { TEST(InputBuffer, SkipNBytes) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/inputbuffer_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); TF_ASSERT_OK(WriteStringToFile(env, fname, "0123456789")); for (auto buf_size : BufferSizes()) { @@ -258,7 +265,8 @@ TEST(InputBuffer, SkipNBytes) { TEST(InputBuffer, Seek) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/inputbuffer_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); TF_ASSERT_OK(WriteStringToFile(env, fname, "0123456789")); for (auto buf_size : BufferSizes()) { @@ -293,7 +301,8 @@ TEST(InputBuffer, Seek) { TEST(InputBuffer, ReadVarint32) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/inputbuffer_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); // Generates data. std::vector data; @@ -331,7 +340,8 @@ TEST(InputBuffer, ReadVarint32) { TEST(InputBuffer, ReadVarint64) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/inputbuffer_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); // Generates data. 
std::vector data; From f446da7fb2c1da0385a70add76c3d140a2b304ba Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Thu, 20 Feb 2020 11:01:47 -0800 Subject: [PATCH 360/442] Use Env::LocalTempFilename for a temp filename. This function works both in and outside of tests. Additionally, LocalTempFilename works well on Windows where as TmpDir is a little problematic because of bazel oddities. PiperOrigin-RevId: 296250909 Change-Id: I313e4e3467e8f5956c681adb577c70918fe853b6 --- .../core/lib/io/buffered_inputstream_test.cc | 39 ++++++++++++------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/tensorflow/core/lib/io/buffered_inputstream_test.cc b/tensorflow/core/lib/io/buffered_inputstream_test.cc index ee4e11ac824..c4af1e707b4 100644 --- a/tensorflow/core/lib/io/buffered_inputstream_test.cc +++ b/tensorflow/core/lib/io/buffered_inputstream_test.cc @@ -32,7 +32,8 @@ static std::vector BufferSizes() { TEST(BufferedInputStream, ReadLine_Empty) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/buffered_inputstream_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); TF_ASSERT_OK(WriteStringToFile(env, fname, "")); std::unique_ptr file; TF_ASSERT_OK(env->NewRandomAccessFile(fname, &file)); @@ -48,7 +49,8 @@ TEST(BufferedInputStream, ReadLine_Empty) { TEST(BufferedInputStream, ReadLine1) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/buffered_inputstream_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); TF_ASSERT_OK( WriteStringToFile(env, fname, "line one\nline two\nline three\n")); std::unique_ptr file; @@ -73,7 +75,8 @@ TEST(BufferedInputStream, ReadLine1) { TEST(BufferedInputStream, ReadLine_NoTrailingNewLine) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/buffered_inputstream_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); TF_ASSERT_OK(WriteStringToFile(env, fname, "line one\nline two\nline three")); std::unique_ptr file; TF_ASSERT_OK(env->NewRandomAccessFile(fname, &file)); @@ -97,7 +100,8 @@ TEST(BufferedInputStream, ReadLine_NoTrailingNewLine) { TEST(BufferedInputStream, ReadLine_EmptyLines) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/buffered_inputstream_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); TF_ASSERT_OK( WriteStringToFile(env, fname, "line one\n\n\nline two\nline three")); std::unique_ptr file; @@ -126,7 +130,8 @@ TEST(BufferedInputStream, ReadLine_EmptyLines) { TEST(BufferedInputStream, ReadLine_CRLF) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/buffered_inputstream_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); TF_ASSERT_OK(WriteStringToFile(env, fname, "line one\r\n\r\n\r\nline two\r\nline three")); std::unique_ptr file; @@ -155,7 +160,8 @@ TEST(BufferedInputStream, ReadLine_CRLF) { TEST(BufferedInputStream, ReadNBytes) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/buffer_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); TF_ASSERT_OK(WriteStringToFile(env, fname, "0123456789")); std::unique_ptr file; TF_ASSERT_OK(env->NewRandomAccessFile(fname, &file)); @@ -192,7 +198,8 @@ TEST(BufferedInputStream, ReadNBytes) { TEST(BufferedInputStream, SkipNBytes) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/buffered_inputstream_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); TF_ASSERT_OK(WriteStringToFile(env, fname, "0123456789")); std::unique_ptr file; 
TF_ASSERT_OK(env->NewRandomAccessFile(fname, &file)); @@ -229,7 +236,8 @@ TEST(BufferedInputStream, SkipNBytes) { TEST(BufferedInputStream, ReadNBytesRandomAccessFile) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/buffer_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); TF_ASSERT_OK(WriteStringToFile(env, fname, "0123456789")); std::unique_ptr file; TF_ASSERT_OK(env->NewRandomAccessFile(fname, &file)); @@ -264,7 +272,8 @@ TEST(BufferedInputStream, ReadNBytesRandomAccessFile) { TEST(BufferedInputStream, SkipNBytesRandomAccessFile) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/buffered_inputstream_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); TF_ASSERT_OK(WriteStringToFile(env, fname, "0123456789")); std::unique_ptr file; TF_ASSERT_OK(env->NewRandomAccessFile(fname, &file)); @@ -299,7 +308,8 @@ TEST(BufferedInputStream, SkipNBytesRandomAccessFile) { TEST(BufferedInputStream, Seek) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/buffered_inputstream_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); TF_ASSERT_OK(WriteStringToFile(env, fname, "0123456789")); std::unique_ptr file; TF_ASSERT_OK(env->NewRandomAccessFile(fname, &file)); @@ -329,7 +339,8 @@ TEST(BufferedInputStream, Seek) { TEST(BufferedInputStream, ReadAll_Empty) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/buffered_inputstream_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); const string expected = ""; TF_ASSERT_OK(WriteStringToFile(env, fname, expected)); std::unique_ptr file; @@ -346,7 +357,8 @@ TEST(BufferedInputStream, ReadAll_Empty) { TEST(BufferedInputStream, ReadAll_Text) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/buffered_inputstream_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); const string expected = "line one\nline two\nline three"; TF_ASSERT_OK(WriteStringToFile(env, fname, expected)); std::unique_ptr file; @@ -365,7 +377,8 @@ void BM_BufferedReaderSmallReads(const int iters, const int buff_size, const int file_size) { testing::StopTiming(); Env* env = Env::Default(); - string fname = testing::TmpDir() + "/buffered_inputstream_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); const string file_elem = "0123456789"; std::unique_ptr write_file; From 95436d61253c59f40b46ed8954c3669624888d2e Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Thu, 20 Feb 2020 11:02:29 -0800 Subject: [PATCH 361/442] Make use of JoinPath to build paths to path references can work correctly across operating systems. PiperOrigin-RevId: 296251146 Change-Id: I4db57dfb924ded085d2cb20969193e497100c052 --- tensorflow/core/platform/BUILD | 1 + tensorflow/core/platform/subprocess_test.cc | 61 +++++++++++++-------- 2 files changed, 39 insertions(+), 23 deletions(-) diff --git a/tensorflow/core/platform/BUILD b/tensorflow/core/platform/BUILD index 1b03357f48e..fb40e56829d 100644 --- a/tensorflow/core/platform/BUILD +++ b/tensorflow/core/platform/BUILD @@ -914,6 +914,7 @@ tf_cc_test( "//tensorflow/core/platform/testdata:test_stderr", ], deps = [ + ":path", ":resource_loader", ":strcat", ":subprocess", diff --git a/tensorflow/core/platform/subprocess_test.cc b/tensorflow/core/platform/subprocess_test.cc index 97da28dcb4b..e264a04ef68 100644 --- a/tensorflow/core/platform/subprocess_test.cc +++ b/tensorflow/core/platform/subprocess_test.cc @@ -21,6 +21,7 @@ limitations under the License. 
#include #include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/path.h" #include "tensorflow/core/platform/resource_loader.h" #include "tensorflow/core/platform/strcat.h" #include "tensorflow/core/platform/test.h" @@ -33,15 +34,9 @@ limitations under the License. #include #endif -const char kEchoProgram[] = "tensorflow/core/platform/testdata/test_echo"; -const char kEchoArgv1Program[] = - "tensorflow/core/platform/testdata/test_echo_argv_1"; -const char kNoopProgram[] = "tensorflow/core/platform/testdata/test_noop"; -const char kStdErrProgram[] = "tensorflow/core/platform/testdata/test_stderr"; - namespace tensorflow { - namespace { + static string GetDataFilePath(const string& relative_path) { #ifdef PLATFORM_WINDOWS // While CreateProcess on windows is resilient to not having ".exe" suffix, @@ -51,20 +46,39 @@ static string GetDataFilePath(const string& relative_path) { return GetDataDependencyFilepath(relative_path); #endif } -} // namespace + +string EchoProgram() { + return io::JoinPath("tensorflow", "core", "platform", "testdata", + "test_echo"); +} + +string EchoArgv1Program() { + return io::JoinPath("tensorflow", "core", "platform", "testdata", + "test_echo_argv_1"); +} + +string NoopProgram() { + return io::JoinPath("tensorflow", "core", "platform", "testdata", + "test_noop"); +} + +string StdErrProgram() { + return io::JoinPath("tensorflow", "core", "platform", "testdata", + "test_stderr"); +} class SubProcessTest : public ::testing::Test {}; TEST_F(SubProcessTest, NoOutputNoComm) { tensorflow::SubProcess proc; - proc.SetProgram(GetDataFilePath(kNoopProgram).c_str(), {kNoopProgram}); + proc.SetProgram(GetDataFilePath(NoopProgram()).c_str(), {NoopProgram()}); EXPECT_TRUE(proc.Start()); EXPECT_TRUE(proc.Wait()); } TEST_F(SubProcessTest, NoOutput) { tensorflow::SubProcess proc; - proc.SetProgram(GetDataFilePath(kNoopProgram).c_str(), {kNoopProgram}); + proc.SetProgram(GetDataFilePath(NoopProgram()).c_str(), {NoopProgram()}); proc.SetChannelAction(CHAN_STDOUT, ACTION_PIPE); proc.SetChannelAction(CHAN_STDERR, ACTION_PIPE); EXPECT_TRUE(proc.Start()); @@ -80,8 +94,8 @@ TEST_F(SubProcessTest, NoOutput) { TEST_F(SubProcessTest, Stdout) { tensorflow::SubProcess proc; const char test_string[] = "hello_world"; - proc.SetProgram(GetDataFilePath(kEchoArgv1Program).c_str(), - {kEchoArgv1Program, test_string}); + proc.SetProgram(GetDataFilePath(EchoArgv1Program()).c_str(), + {EchoArgv1Program(), test_string}); proc.SetChannelAction(CHAN_STDOUT, ACTION_PIPE); proc.SetChannelAction(CHAN_STDERR, ACTION_PIPE); EXPECT_TRUE(proc.Start()); @@ -97,8 +111,8 @@ TEST_F(SubProcessTest, Stdout) { TEST_F(SubProcessTest, StdoutIgnored) { tensorflow::SubProcess proc; const char test_string[] = "hello_world"; - proc.SetProgram(GetDataFilePath(kEchoArgv1Program).c_str(), - {kEchoArgv1Program, test_string}); + proc.SetProgram(GetDataFilePath(EchoArgv1Program()).c_str(), + {EchoArgv1Program(), test_string}); proc.SetChannelAction(CHAN_STDOUT, ACTION_PIPE); proc.SetChannelAction(CHAN_STDERR, ACTION_PIPE); EXPECT_TRUE(proc.Start()); @@ -111,8 +125,8 @@ TEST_F(SubProcessTest, StdoutIgnored) { TEST_F(SubProcessTest, Stderr) { tensorflow::SubProcess proc; const char test_string[] = "muh_failure!"; - proc.SetProgram(GetDataFilePath(kStdErrProgram).c_str(), - {kStdErrProgram, test_string}); + proc.SetProgram(GetDataFilePath(StdErrProgram()).c_str(), + {StdErrProgram(), test_string}); proc.SetChannelAction(CHAN_STDOUT, ACTION_PIPE); proc.SetChannelAction(CHAN_STDERR, ACTION_PIPE); 
EXPECT_TRUE(proc.Start()); @@ -128,8 +142,8 @@ TEST_F(SubProcessTest, Stderr) { TEST_F(SubProcessTest, StderrIgnored) { tensorflow::SubProcess proc; const char test_string[] = "muh_failure!"; - proc.SetProgram(GetDataFilePath(kStdErrProgram).c_str(), - {kStdErrProgram, test_string}); + proc.SetProgram(GetDataFilePath(StdErrProgram()).c_str(), + {StdErrProgram(), test_string}); proc.SetChannelAction(CHAN_STDOUT, ACTION_PIPE); proc.SetChannelAction(CHAN_STDERR, ACTION_PIPE); EXPECT_TRUE(proc.Start()); @@ -141,7 +155,7 @@ TEST_F(SubProcessTest, StderrIgnored) { TEST_F(SubProcessTest, Stdin) { tensorflow::SubProcess proc; - proc.SetProgram(GetDataFilePath(kEchoProgram).c_str(), {kEchoProgram}); + proc.SetProgram(GetDataFilePath(EchoProgram()).c_str(), {EchoProgram()}); proc.SetChannelAction(CHAN_STDIN, ACTION_PIPE); EXPECT_TRUE(proc.Start()); @@ -153,7 +167,7 @@ TEST_F(SubProcessTest, Stdin) { TEST_F(SubProcessTest, StdinStdout) { tensorflow::SubProcess proc; - proc.SetProgram(GetDataFilePath(kEchoProgram).c_str(), {kEchoProgram}); + proc.SetProgram(GetDataFilePath(EchoProgram()).c_str(), {EchoProgram()}); proc.SetChannelAction(CHAN_STDIN, ACTION_PIPE); proc.SetChannelAction(CHAN_STDOUT, ACTION_PIPE); EXPECT_TRUE(proc.Start()); @@ -170,7 +184,7 @@ TEST_F(SubProcessTest, StdinStdout) { TEST_F(SubProcessTest, StdinChildExit) { tensorflow::SubProcess proc; - proc.SetProgram(GetDataFilePath(kNoopProgram).c_str(), {kNoopProgram}); + proc.SetProgram(GetDataFilePath(NoopProgram()).c_str(), {NoopProgram()}); proc.SetChannelAction(CHAN_STDIN, ACTION_PIPE); EXPECT_TRUE(proc.Start()); @@ -189,7 +203,7 @@ TEST_F(SubProcessTest, StdinChildExit) { TEST_F(SubProcessTest, StdinStdoutOverlap) { tensorflow::SubProcess proc; - proc.SetProgram(GetDataFilePath(kEchoProgram).c_str(), {kEchoProgram}); + proc.SetProgram(GetDataFilePath(EchoProgram()).c_str(), {EchoProgram()}); proc.SetChannelAction(CHAN_STDIN, ACTION_PIPE); proc.SetChannelAction(CHAN_STDOUT, ACTION_PIPE); EXPECT_TRUE(proc.Start()); @@ -213,7 +227,7 @@ TEST_F(SubProcessTest, StdinStdoutOverlap) { TEST_F(SubProcessTest, KillProc) { tensorflow::SubProcess proc; - proc.SetProgram(GetDataFilePath(kEchoProgram).c_str(), {kEchoProgram}); + proc.SetProgram(GetDataFilePath(EchoProgram()).c_str(), {EchoProgram()}); proc.SetChannelAction(CHAN_STDIN, ACTION_PIPE); proc.SetChannelAction(CHAN_STDOUT, ACTION_PIPE); EXPECT_TRUE(proc.Start()); @@ -224,4 +238,5 @@ TEST_F(SubProcessTest, KillProc) { EXPECT_FALSE(proc.Kill(SIGKILL)); } +} // namespace } // namespace tensorflow From 1fa03ff291cc211730bef3e165a95804599563e8 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Thu, 20 Feb 2020 11:04:07 -0800 Subject: [PATCH 362/442] Make use of GetDataDependencyFilepath and JoinPath to build paths which will work across operating systems. The previous implementation doesn't work correctly on Windows. 
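This change and the subprocess_test one before it converge on the same portable recipe. A minimal sketch, assuming the platform path and resource_loader headers used in the diffs; the relative path is the one from the oauth_client test and stands in for any test data dependency:

    #include <string>
    #include "tensorflow/core/platform/path.h"
    #include "tensorflow/core/platform/resource_loader.h"

    std::string CredentialsPath() {
      // JoinPath inserts separators that are valid for the host OS, and
      // GetDataDependencyFilepath resolves where the test data was placed,
      // so the result also works on Windows.
      return tensorflow::GetDataDependencyFilepath(tensorflow::io::JoinPath(
          "tensorflow", "core", "platform", "cloud", "testdata",
          "service_account_credentials.json"));
    }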
PiperOrigin-RevId: 296251590 Change-Id: Iac5ad8dbf78d06969a51b9476f66e0b8affdaaa4 --- tensorflow/core/platform/cloud/BUILD | 1 + .../core/platform/cloud/oauth_client_test.cc | 15 ++++++++------- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/tensorflow/core/platform/cloud/BUILD b/tensorflow/core/platform/cloud/BUILD index 21e826242f9..c28755a6d8c 100644 --- a/tensorflow/core/platform/cloud/BUILD +++ b/tensorflow/core/platform/cloud/BUILD @@ -428,6 +428,7 @@ tf_cc_test( "//tensorflow/core:test_main", "//tensorflow/core/platform:base64", "//tensorflow/core/platform:path", + "//tensorflow/core/platform:resource_loader", "//tensorflow/core/platform:scanner", "@boringssl//:crypto", ], diff --git a/tensorflow/core/platform/cloud/oauth_client_test.cc b/tensorflow/core/platform/cloud/oauth_client_test.cc index 8dfff63873f..babf249f5d6 100644 --- a/tensorflow/core/platform/cloud/oauth_client_test.cc +++ b/tensorflow/core/platform/cloud/oauth_client_test.cc @@ -25,13 +25,16 @@ limitations under the License. #include "tensorflow/core/platform/cloud/http_request_fake.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/resource_loader.h" #include "tensorflow/core/platform/scanner.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { namespace { -constexpr char kTestData[] = "core/platform/cloud/testdata/"; +string TestData() { + return io::JoinPath("tensorflow", "core", "platform", "cloud", "testdata"); +} constexpr char kTokenJson[] = R"( { @@ -92,9 +95,8 @@ TEST(OAuthClientTest, GetTokenFromRefreshTokenJson) { } TEST(OAuthClientTest, GetTokenFromServiceAccountJson) { - std::ifstream credentials( - io::JoinPath(io::JoinPath(testing::TensorFlowSrcRoot(), kTestData), - "service_account_credentials.json")); + std::ifstream credentials(GetDataDependencyFilepath( + io::JoinPath(TestData(), "service_account_credentials.json"))); ASSERT_TRUE(credentials.is_open()); Json::Value json; Json::Reader reader; @@ -135,9 +137,8 @@ TEST(OAuthClientTest, GetTokenFromServiceAccountJson) { // Check that 'signature' signs 'header_dot_claim'. // Read the serialized public key. - std::ifstream public_key_stream( - io::JoinPath(io::JoinPath(testing::TensorFlowSrcRoot(), kTestData), - "service_account_public_key.txt")); + std::ifstream public_key_stream(GetDataDependencyFilepath( + io::JoinPath(TestData(), "service_account_public_key.txt"))); string public_key_serialized( (std::istreambuf_iterator(public_key_stream)), (std::istreambuf_iterator())); From eaedb464a0a7e3f20c8d5d1c589ebfb57c3f8792 Mon Sep 17 00:00:00 2001 From: Yifei Feng Date: Thu, 20 Feb 2020 11:04:20 -0800 Subject: [PATCH 363/442] Remove the main repo reference. PiperOrigin-RevId: 296251652 Change-Id: Ide83f2957a08b688838d8de7af92fd1cc36369e5 --- third_party/tensorrt/tensorrt_configure.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/tensorrt/tensorrt_configure.bzl b/third_party/tensorrt/tensorrt_configure.bzl index 6bd71049248..9c980a92cf8 100644 --- a/third_party/tensorrt/tensorrt_configure.bzl +++ b/third_party/tensorrt/tensorrt_configure.bzl @@ -75,7 +75,7 @@ def _create_dummy_repository(repository_ctx): # Copy license file in non-remote build. repository_ctx.template( "LICENSE", - Label("@org_tensorflow//third_party/tensorrt:LICENSE"), + Label("//third_party/tensorrt:LICENSE"), {}, ) @@ -136,7 +136,7 @@ def _create_local_tensorrt_repository(repository_ctx): # Copy license file in non-remote build. 
repository_ctx.template( "LICENSE", - Label("@org_tensorflow//third_party/tensorrt:LICENSE"), + Label("//third_party/tensorrt:LICENSE"), {}, ) From e972858a29586b8cfa277d1b234129286432f10d Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Thu, 20 Feb 2020 11:04:25 -0800 Subject: [PATCH 364/442] Use Env::LocalTempFilename for a temp filename. This function works both in and outside of tests. Additionally, LocalTempFilename works well on Windows where as TmpDir is a little problematic because of bazel oddities. PiperOrigin-RevId: 296251680 Change-Id: I985f178e2e85105cf79c4572b9158e168490348c --- tensorflow/core/lib/io/zlib_buffers_test.cc | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/lib/io/zlib_buffers_test.cc b/tensorflow/core/lib/io/zlib_buffers_test.cc index 7e44ac1bb09..34511e5dbbc 100644 --- a/tensorflow/core/lib/io/zlib_buffers_test.cc +++ b/tensorflow/core/lib/io/zlib_buffers_test.cc @@ -63,7 +63,8 @@ typedef io::ZlibCompressionOptions CompressionOptions; void TestAllCombinations(CompressionOptions input_options, CompressionOptions output_options) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/zlib_buffers_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); for (auto file_size : NumCopies()) { // Write to compressed file string data = GenTestString(file_size); @@ -114,7 +115,8 @@ void TestMultipleWrites(uint8 input_buf_size, uint8 output_buf_size, CompressionOptions input_options = CompressionOptions::DEFAULT(); CompressionOptions output_options = CompressionOptions::DEFAULT(); - string fname = testing::TmpDir() + "/zlib_buffers_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); string data = GenTestString(); std::unique_ptr file_writer; string actual_result; @@ -162,7 +164,8 @@ TEST(ZlibBuffers, MultipleWriteCallsWithFlush) { TEST(ZlibInputStream, FailsToReadIfWindowBitsAreIncompatible) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/zlib_buffers_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); CompressionOptions output_options = CompressionOptions::DEFAULT(); CompressionOptions input_options = CompressionOptions::DEFAULT(); int input_buf_size = 200, output_buf_size = 200; @@ -214,7 +217,8 @@ void WriteCompressedFile(Env* env, const string& fname, int input_buf_size, void TestTell(CompressionOptions input_options, CompressionOptions output_options) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/zlib_buffers_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); for (auto file_size : NumCopies()) { string data = GenTestString(file_size); for (auto input_buf_size : InputBufferSizes()) { @@ -258,7 +262,8 @@ void TestTell(CompressionOptions input_options, void TestSkipNBytes(CompressionOptions input_options, CompressionOptions output_options) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/zlib_buffers_test"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); for (auto file_size : NumCopies()) { string data = GenTestString(file_size); for (auto input_buf_size : InputBufferSizes()) { @@ -296,7 +301,8 @@ void TestSkipNBytes(CompressionOptions input_options, void TestSoftErrorOnDecompress(CompressionOptions input_options) { Env* env = Env::Default(); - string fname = testing::TmpDir() + "/garbage_data"; + string fname; + ASSERT_TRUE(env->LocalTempFilename(&fname)); input_options.soft_fail_on_error = true; From e379af2b6573c271ed62f989159452f12370b532 Mon Sep 17 
00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 20 Feb 2020 11:13:35 -0800 Subject: [PATCH 365/442] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 296254153 Change-Id: I475e330a8465070c9d6ee6789f46d0e1ccb9658f --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 449a95765a5..ecdce1e627b 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. 
-// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21345,7 +21345,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22053,7 +22053,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22249,7 +22249,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22318,7 +22318,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22433,7 +22433,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22492,7 +22492,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22666,7 +22666,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22857,7 +22857,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25297,7 +25297,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25629,7 +25629,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25679,7 +25679,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25929,7 +25929,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26559,7 +26559,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27624,7 +27624,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45536,7 +45536,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 93d512f8fe55c791143abd0abecb0e9fe997d28f Mon Sep 17 00:00:00 2001 From: Gunhan Gulsoy Date: Thu, 20 Feb 2020 11:17:30 -0800 Subject: [PATCH 366/442] Make sure aot codegen test is not confused if windows introduces CRLF line endings. PiperOrigin-RevId: 296255135 Change-Id: I234fe6b76f0cd9ead1d3dc69bf657160d2d910f8 --- tensorflow/compiler/aot/BUILD | 1 + tensorflow/compiler/aot/codegen_test.cc | 35 ++++++++++++++++++++----- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD index a53d5265459..dfbea9c49eb 100644 --- a/tensorflow/compiler/aot/BUILD +++ b/tensorflow/compiler/aot/BUILD @@ -84,6 +84,7 @@ tf_cc_test( "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", + "//tensorflow/core/platform:resource_loader", "@com_google_absl//absl/strings", "@llvm-project//llvm:support", # fixdeps: keep "@llvm-project//llvm:x86_code_gen", # fixdeps: keep diff --git a/tensorflow/compiler/aot/codegen_test.cc b/tensorflow/compiler/aot/codegen_test.cc index a7294323d1d..6206f68faf9 100644 --- a/tensorflow/compiler/aot/codegen_test.cc +++ b/tensorflow/compiler/aot/codegen_test.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/aot/codegen.h" +#include #include #include @@ -29,6 +30,7 @@ limitations under the License. 
#include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/resource_loader.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { @@ -139,23 +141,40 @@ TEST_F(ParseCppClassTest, ParseFail) { static void CompareWithGoldenFile( const string& tensorflow_relative_golden_file_name, - const string& expected_contents) { + const string& expected_contents, bool ignore_cr) { + // Get rid of all CR characters, we may be running under windows. + string sanitized_expected_contents(expected_contents); + if (ignore_cr) { + sanitized_expected_contents.erase( + std::remove(sanitized_expected_contents.begin(), + sanitized_expected_contents.end(), '\r'), + sanitized_expected_contents.end()); + } + // To update the golden file, flip update_golden to true and run the // following: // bazel test --test_strategy=local \ // third_party/tensorflow/compiler/aot:codegen_test const bool update_golden = false; - const string golden_file_name = io::JoinPath( - testing::TensorFlowSrcRoot(), tensorflow_relative_golden_file_name); + string golden_file_name; if (update_golden) { + golden_file_name = io::JoinPath(testing::TensorFlowSrcRoot(), + tensorflow_relative_golden_file_name); TF_EXPECT_OK( WriteStringToFile(Env::Default(), golden_file_name, expected_contents)); } + golden_file_name = + GetDataDependencyFilepath(tensorflow_relative_golden_file_name); string golden_file_contents; TF_ASSERT_OK(ReadFileToString(Env::Default(), golden_file_name, &golden_file_contents)); + if (ignore_cr) { + golden_file_contents.erase(std::remove(golden_file_contents.begin(), + golden_file_contents.end(), '\r'), + golden_file_contents.end()); + } EXPECT_EQ(golden_file_contents, expected_contents); } @@ -229,14 +248,18 @@ TEST(CodegenTest, Golden) { // The other fields in metadata_result are tested as part of the generated // header test. - CompareWithGoldenFile("compiler/aot/codegen_test_o.golden", - metadata_result.object_file_data); + // This specific golden test checks a binary file. It can potentially run into + // issues due to ABIs not being stable, but has not so far. + // If we see any ABI issues, we should reconsider this specific test case. + CompareWithGoldenFile("tensorflow/compiler/aot/codegen_test_o.golden", + metadata_result.object_file_data, false); string header; TF_ASSERT_OK( GenerateHeader(opts, config, compile_result, metadata_result, &header)); - CompareWithGoldenFile("compiler/aot/codegen_test_h.golden", header); + CompareWithGoldenFile("tensorflow/compiler/aot/codegen_test_h.golden", header, + true); } } // namespace } // namespace tfcompile From c06dc938d6015180db8b970c2c47fa7dfba8d391 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Thu, 20 Feb 2020 11:33:44 -0800 Subject: [PATCH 367/442] Remove recursive visibility restriction subsumed by broader restriction. 
PiperOrigin-RevId: 296259183 Change-Id: Iadb880dae371a95ba8f51e2f0b34d2445dbf1ff6 --- tensorflow/core/platform/cloud/BUILD | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/core/platform/cloud/BUILD b/tensorflow/core/platform/cloud/BUILD index c28755a6d8c..53c4f6cda1f 100644 --- a/tensorflow/core/platform/cloud/BUILD +++ b/tensorflow/core/platform/cloud/BUILD @@ -19,7 +19,6 @@ package_group( name = "dependency_whitelist", packages = [ "//learning/brain/tfrc/...", - "//learning/brain/tfrc/tpu_gcs_file_system/...", "//tensorflow/...", ], ) From 89d0729777e5bc29edb238cb15850398cbad323a Mon Sep 17 00:00:00 2001 From: Andrew Audibert Date: Thu, 20 Feb 2020 11:53:02 -0800 Subject: [PATCH 368/442] Fix bug where MapAndBatch fusion reverts parallelism to 1. To be affected by this bug, a user would need to either manually set their forward compatibility window past 3/6, or set the recently-added deterministic argument in a call to Dataset.map(). PiperOrigin-RevId: 296263821 Change-Id: I4414719f53d80364503880b784b2389c099dc62b --- .../optimizers/data/map_and_batch_fusion.cc | 2 + .../data/map_and_batch_fusion_test.cc | 85 +++++++++++++++++++ 2 files changed, 87 insertions(+) diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc index 56739f9840b..043dfebbb5f 100644 --- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc +++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc @@ -71,6 +71,8 @@ NodeDef MakeMapAndBatchNode(const NodeDef& map_node, const NodeDef& batch_node, NodeDef* tmp = graph_utils::AddScalarConstNode( v->attr().at("value").tensor().int_val(0), graph); new_node.add_input(tmp->name()); + } else if (map_node.op() == kParallelMapV2) { + new_node.add_input(map_node.input(map_node.input_size() - 1)); } else { NodeDef* tmp = graph_utils::AddScalarConstNode(1, graph); new_node.add_input(tmp->name()); diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc index 34e58a52acd..7e9acb1d107 100644 --- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc +++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc @@ -276,6 +276,91 @@ TEST(MapAndBatchFusionTest, FuseParallelMapAndBatchNodesIntoOne) { batch_node->attr().at("output_types"))); } +TEST(MapAndBatchFusionTest, FuseParallelMapV2AndBatchNodesIntoOne) { + GrapplerItem item; + MutableGraphView graph(&item.graph); + NodeDef *start_node = graph_utils::AddScalarConstNode(0, &graph); + NodeDef *stop_node = graph_utils::AddScalarConstNode(10, &graph); + NodeDef *step_node = graph_utils::AddScalarConstNode(1, &graph); + + std::vector range_inputs(3); + range_inputs[0] = start_node->name(); + range_inputs[1] = stop_node->name(); + range_inputs[2] = step_node->name(); + std::vector> range_attrs; + NodeDef *range_node = graph_utils::AddNode("", "RangeDataset", range_inputs, + range_attrs, &graph); + NodeDef *captured_input_node = + graph_utils::AddScalarConstNode("hello", &graph); + NodeDef *num_parallel_calls_node = + graph_utils::AddScalarConstNode(2, &graph); + + NodeDef *map_node; + { + std::vector map_inputs(3); + map_inputs[0] = range_node->name(); + map_inputs[1] = captured_input_node->name(); + map_inputs[2] = num_parallel_calls_node->name(); + std::vector> map_attrs(2); + AttrValue f_attr; + SetAttrValue("f", &f_attr); + map_attrs[0] = std::make_pair("f", f_attr); + AttrValue 
args_attr; + SetAttrValue("Targuments", &args_attr); + map_attrs[1] = std::make_pair("Targuments", args_attr); + map_node = graph_utils::AddNode("", "ParallelMapDatasetV2", map_inputs, + map_attrs, &graph); + } + + NodeDef *batch_size_node = graph_utils::AddScalarConstNode(5, &graph); + NodeDef *batch_node; + { + std::vector batch_inputs(2); + batch_inputs[0] = map_node->name(); + batch_inputs[1] = batch_size_node->name(); + std::vector> batch_attrs(2); + AttrValue shapes_attr; + SetAttrValue("output_shapes", &shapes_attr); + batch_attrs[0] = std::make_pair("output_shapes", shapes_attr); + AttrValue types_attr; + SetAttrValue("output_types", &types_attr); + batch_attrs[1] = std::make_pair("output_types", types_attr); + batch_node = graph_utils::AddNode("", "BatchDataset", batch_inputs, + batch_attrs, &graph); + } + + MapAndBatchFusion optimizer; + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); + + EXPECT_FALSE( + graph_utils::ContainsGraphNodeWithName(map_node->name(), output)); + EXPECT_FALSE( + graph_utils::ContainsGraphNodeWithName(batch_node->name(), output)); + EXPECT_TRUE(graph_utils::ContainsNodeWithOp("MapAndBatchDataset", output)); + NodeDef map_and_batch_node = output.node( + graph_utils::FindGraphNodeWithOp("MapAndBatchDataset", output)); + EXPECT_EQ(map_and_batch_node.input_size(), 5); + EXPECT_EQ(map_and_batch_node.input(0), map_node->input(0)); + EXPECT_EQ(map_and_batch_node.input(1), map_node->input(1)); + EXPECT_EQ(map_and_batch_node.input(2), batch_node->input(1)); + NodeDef num_parallel_calls_node2 = output.node( + graph_utils::FindGraphNodeWithName(map_and_batch_node.input(3), output)); + EXPECT_EQ(num_parallel_calls_node2.attr().at("value").tensor().int64_val(0), + 2); + NodeDef drop_remainder_node = output.node( + graph_utils::FindGraphNodeWithName(map_and_batch_node.input(4), output)); + EXPECT_EQ(drop_remainder_node.attr().at("value").tensor().bool_val(0), false); + EXPECT_TRUE(AreAttrValuesEqual(map_and_batch_node.attr().at("f"), + map_node->attr().at("f"))); + EXPECT_TRUE(AreAttrValuesEqual(map_and_batch_node.attr().at("Targuments"), + map_node->attr().at("Targuments"))); + EXPECT_TRUE(AreAttrValuesEqual(map_and_batch_node.attr().at("output_shapes"), + batch_node->attr().at("output_shapes"))); + EXPECT_TRUE(AreAttrValuesEqual(map_and_batch_node.attr().at("output_types"), + batch_node->attr().at("output_types"))); +} + TEST(MapAndBatchFusionTest, NoChange) { GrapplerItem item; MutableGraphView graph(&item.graph); From b49ef791067a21579aae63907d62dede813d615e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 20 Feb 2020 11:59:01 -0800 Subject: [PATCH 369/442] Override buffer size when running on cloud tpu. A small buffer size adds a performance slowdown. Here we override the buffer size to a minimum recommended buffer size. PiperOrigin-RevId: 296265141 Change-Id: I3a9ca24d9a89a810407ce87cc33bc6f4540ce47a --- .../core/kernels/data/tf_record_dataset_op.cc | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tensorflow/core/kernels/data/tf_record_dataset_op.cc b/tensorflow/core/kernels/data/tf_record_dataset_op.cc index b2a78794d36..8b6658167ea 100644 --- a/tensorflow/core/kernels/data/tf_record_dataset_op.cc +++ b/tensorflow/core/kernels/data/tf_record_dataset_op.cc @@ -38,6 +38,15 @@ namespace data { constexpr char kCurrentFileIndex[] = "current_file_index"; constexpr char kOffset[] = "offset"; +constexpr char kGcsFsPrefix[] = "gs://"; +constexpr int64 kCloudTpuBlockSize = 127LL << 20; // 127MB. 
+ +bool is_cloud_tpu_gcs_fs() { +#if defined(PLATFORM_CLOUD_TPU) && defined(TPU_GCS_FS) + return true; +#endif + return false; +} class TFRecordDatasetOp::Dataset : public DatasetBase { public: @@ -224,11 +233,13 @@ void TFRecordDatasetOp::MakeDataset(OpKernelContext* ctx, ctx, filenames_tensor->dims() <= 1, errors::InvalidArgument("`filenames` must be a scalar or a vector.")); + bool is_gcs_fs = true; std::vector filenames; filenames.reserve(filenames_tensor->NumElements()); for (int i = 0; i < filenames_tensor->NumElements(); ++i) { VLOG(2) << "Reading file: " << filenames_tensor->flat()(i); filenames.push_back(filenames_tensor->flat()(i)); + is_gcs_fs &= absl::StartsWith(filenames[i], kGcsFsPrefix); } tstring compression_type; @@ -242,6 +253,14 @@ void TFRecordDatasetOp::MakeDataset(OpKernelContext* ctx, errors::InvalidArgument( "`buffer_size` must be >= 0 (0 == no buffering)")); + if (is_gcs_fs && is_cloud_tpu_gcs_fs() && buffer_size < kCloudTpuBlockSize) { + LOG(WARNING) << "User buffer size is too small for reading Cloud TPU " + << "TFRecords stored in GCS. Overriding " << buffer_size + << " to the minimum recommended buffer_size = " + << kCloudTpuBlockSize; + buffer_size = kCloudTpuBlockSize; + } + *output = new Dataset(ctx, std::move(filenames), compression_type, buffer_size); } From 27da548f1aaefebf56f57eb906848025b9ac9116 Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Thu, 20 Feb 2020 12:00:03 -0800 Subject: [PATCH 370/442] Automated rollback of commit a7c9317a40f85bdfd606d10a6f8f3d21325d0f95 PiperOrigin-RevId: 296265367 Change-Id: Ie631071ec9894f297db3f82d43153be11ec249c9 --- tensorflow/c/eager/c_api_test.cc | 43 +++++++++++++++---- .../common_runtime/eager/tensor_handle.cc | 12 +++--- 2 files changed, 40 insertions(+), 15 deletions(-) diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc index 04060b13885..7a089a30164 100644 --- a/tensorflow/c/eager/c_api_test.cc +++ b/tensorflow/c/eager/c_api_test.cc @@ -369,7 +369,7 @@ TEST(CAPI, TensorHandleCopyBetweenTwoGPUDevicesAsync) { void TensorHandleSilentCopy(bool async, TFE_ContextDevicePlacementPolicy global_policy, TFE_ContextDevicePlacementPolicy thread_policy, - bool cpu_op) { + bool mirror, bool cpu_op) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); TFE_ContextOptions* opts = TFE_NewContextOptions(); @@ -392,6 +392,12 @@ void TensorHandleSilentCopy(bool async, TFE_TensorHandle* hgpu = TFE_TensorHandleCopyToDevice( hcpu, ctx, gpu_device_name.c_str(), status.get()); ASSERT_EQ(TF_GetCode(status.get()), TF_OK) << TF_Message(status.get()); + if (mirror) { + TFE_TensorHandleEnableImplicitMirroring(hcpu, status.get()); + ASSERT_EQ(TF_GetCode(status.get()), TF_OK) << TF_Message(status.get()); + TFE_TensorHandleEnableImplicitMirroring(hgpu, status.get()); + ASSERT_EQ(TF_GetCode(status.get()), TF_OK) << TF_Message(status.get()); + } TFE_Op* matmul = MatMulOp(ctx, hcpu, hgpu); if (cpu_op) { @@ -416,12 +422,23 @@ void TensorHandleSilentCopy(bool async, hgpu->handle.get()) ->Handle(); - // The input handles should never change since they have been mirrored. auto op = tensorflow::down_cast( matmul->operation.get()); - ASSERT_EQ(op->GetInput(0), arg0); - ASSERT_EQ(op->GetInput(1), arg1); - + if (mirror) { + // The input handles should never change since they have been mirrored. 
+ ASSERT_EQ(op->GetInput(0), arg0); + ASSERT_EQ(op->GetInput(1), arg1); + } else { + if (cpu_op) { + ASSERT_EQ(op->GetInput(0), arg0); + // The GPU handle should be replaced with a CPU copy + ASSERT_NE(op->GetInput(1), arg1); + } else { + // The CPU handle should be replaced with a GPU copy + ASSERT_NE(op->GetInput(0), arg0); + ASSERT_EQ(op->GetInput(1), arg1); + } + } TFE_DeleteOp(matmul); TFE_DeleteTensorHandle(retvals[0]); TFE_DeleteTensorHandle(hgpu); @@ -437,19 +454,27 @@ void TensorHandleSilentCopy(bool async, } TEST(CAPI, TensorHandleSilentCopy) { TensorHandleSilentCopy(false, TFE_DEVICE_PLACEMENT_SILENT, - TFE_DEVICE_PLACEMENT_SILENT, false); + TFE_DEVICE_PLACEMENT_SILENT, false, false); } TEST(CAPI, TensorHandleSilentCopyAsync) { TensorHandleSilentCopy(true, TFE_DEVICE_PLACEMENT_SILENT, - TFE_DEVICE_PLACEMENT_SILENT, false); + TFE_DEVICE_PLACEMENT_SILENT, false, false); } TEST(CAPI, TensorHandleSilentCopyLocalPolicy) { TensorHandleSilentCopy(false, TFE_DEVICE_PLACEMENT_EXPLICIT, - TFE_DEVICE_PLACEMENT_SILENT, false); + TFE_DEVICE_PLACEMENT_SILENT, false, false); } TEST(CAPI, TensorHandleSilentCopyLocalPolicyAsync) { TensorHandleSilentCopy(true, TFE_DEVICE_PLACEMENT_EXPLICIT, - TFE_DEVICE_PLACEMENT_SILENT, false); + TFE_DEVICE_PLACEMENT_SILENT, false, false); +} +TEST(CAPI, TensorHandleMirrorCopy) { + TensorHandleSilentCopy(false, TFE_DEVICE_PLACEMENT_SILENT, + TFE_DEVICE_PLACEMENT_SILENT, true, false); +} +TEST(CAPI, TensorHandleMirrorCopyCpu) { + TensorHandleSilentCopy(false, TFE_DEVICE_PLACEMENT_SILENT, + TFE_DEVICE_PLACEMENT_SILENT, true, true); } void SetAndGetOpDevices(bool async) { diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc index ef2b3104ed8..0a4d3bd8120 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.cc +++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc @@ -143,7 +143,7 @@ TensorHandle::TensorHandle(std::unique_ptr t, ctx_(ctx), is_remote_(false), is_async_(false), - implicit_mirroring_(true), + implicit_mirroring_(false), is_ready_(true), tensor_handle_data_(std::move(t)) { DVLOG(3) << "Creating Local TensorHandle: " << this @@ -164,7 +164,7 @@ TensorHandle::TensorHandle(std::unique_ptr t, ctx_(ctx), is_remote_(false), is_async_(false), - implicit_mirroring_(true), + implicit_mirroring_(false), is_ready_(true), handle_dtypes_and_shapes_(resource_handle.dtypes_and_shapes()), tensor_handle_data_(std::move(t)) { @@ -185,7 +185,7 @@ TensorHandle::TensorHandle(std::unique_ptr t, ctx_(ctx), is_remote_(false), is_async_(false), - implicit_mirroring_(true), + implicit_mirroring_(false), is_ready_(true), tensor_handle_data_(std::move(t)) { // TODO(allenl): Figure out a better op_device story for custom devices, @@ -220,7 +220,7 @@ TensorHandle::TensorHandle(std::unique_ptr t, ctx_(ctx), is_remote_(false), is_async_(async), - implicit_mirroring_(true), + implicit_mirroring_(false), is_ready_(!async), tensor_handle_data_(std::move(t)) { DVLOG(3) << "Creating empty Local TensorHandle: " << this @@ -261,7 +261,7 @@ TensorHandle::TensorHandle(std::unique_ptr t, ctx_(ctx), is_remote_(true), is_async_(false), - implicit_mirroring_(true), + implicit_mirroring_(false), is_ready_(true), tensor_handle_data_(std::move(t)) { DVLOG(3) << "Creating Remote TensorHandle: " << this @@ -298,7 +298,7 @@ TensorHandle::TensorHandle(std::unique_ptr t, ctx_(ctx), is_remote_(true), is_async_(true), - implicit_mirroring_(true), + implicit_mirroring_(false), is_ready_(false), 
tensor_handle_data_(std::move(t)) { DVLOG(3) << "Creating Unshaped Remote TensorHandle: " << this From d284478b0074c48f2e15cf51d5d99837f433de5d Mon Sep 17 00:00:00 2001 From: Prakalp Srivastava Date: Thu, 20 Feb 2020 12:17:43 -0800 Subject: [PATCH 371/442] NFC: Use OpRewritePattern in xla_hlo to std legalization. PiperOrigin-RevId: 296269235 Change-Id: I8a58df72db49531993e384e041d2d7f5a14648bb --- .../xla/transforms/legalize_to_standard.cc | 50 ++++++++----------- 1 file changed, 20 insertions(+), 30 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard.cc index 5e12abc466c..9720d2abd8e 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard.cc @@ -24,12 +24,6 @@ limitations under the License. #include "tensorflow/compiler/mlir/xla/transforms/passes.h" #include "tensorflow/compiler/mlir/xla/transforms/rewriters.h" -using mlir::Builder; -using mlir::FunctionPass; -using mlir::OpPassBase; -using mlir::OwningRewritePatternList; -using mlir::PassRegistration; - namespace mlir { namespace { #include "tensorflow/compiler/mlir/xla/transforms/generated_legalize_to_standard.inc" @@ -37,16 +31,14 @@ namespace { namespace xla_hlo { namespace { -struct CompareIConvert : public RewritePattern { - explicit CompareIConvert(MLIRContext *context) - : RewritePattern("xla_hlo.compare", 1, context) {} +class CompareIConvert : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; - PatternMatchResult matchAndRewrite(Operation *op, + PatternMatchResult matchAndRewrite(xla_hlo::CompareOp op, PatternRewriter &rewriter) const override { - auto compare_op = cast(op); - - auto lhs = compare_op.lhs(); - auto rhs = compare_op.rhs(); + auto lhs = op.lhs(); + auto rhs = op.rhs(); auto lhs_type = lhs.getType().cast(); auto rhs_type = rhs.getType().cast(); @@ -57,7 +49,7 @@ struct CompareIConvert : public RewritePattern { !rhs_type.getElementType().isa()) return matchFailure(); - auto comparison_direction = compare_op.comparison_direction(); + auto comparison_direction = op.comparison_direction(); auto compare_predicate = llvm::StringSwitch>(comparison_direction) .Case("EQ", CmpIPredicate::eq) @@ -76,16 +68,14 @@ struct CompareIConvert : public RewritePattern { } }; -struct CompareFConvert : public RewritePattern { - explicit CompareFConvert(MLIRContext *context) - : RewritePattern("xla_hlo.compare", 1, context) {} +class CompareFConvert : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; - PatternMatchResult matchAndRewrite(Operation *op, + PatternMatchResult matchAndRewrite(xla_hlo::CompareOp op, PatternRewriter &rewriter) const override { - auto compare_op = cast(op); - - auto lhs = compare_op.lhs(); - auto rhs = compare_op.rhs(); + auto lhs = op.lhs(); + auto rhs = op.rhs(); auto lhs_type = lhs.getType().cast(); auto rhs_type = rhs.getType().cast(); @@ -96,7 +86,7 @@ struct CompareFConvert : public RewritePattern { !rhs_type.getElementType().isa()) return matchFailure(); - auto comparison_direction = compare_op.comparison_direction(); + auto comparison_direction = op.comparison_direction(); CmpFPredicate compare_predicate = llvm::StringSwitch(comparison_direction) .Case("EQ", CmpFPredicate::OEQ) @@ -116,8 +106,6 @@ struct CompareFConvert : public RewritePattern { }; } // end anonymous namespace -} // end namespace xla_hlo -} // end namespace mlir namespace { struct 
LegalizeToStandard : public FunctionPass { @@ -126,13 +114,12 @@ struct LegalizeToStandard : public FunctionPass { }; } // end anonymous namespace -std::unique_ptr> -mlir::xla_hlo::createLegalizeToStdPass() { +std::unique_ptr> createLegalizeToStdPass() { return std::make_unique(); } -void mlir::xla_hlo::PopulateXlaToStdPatterns(OwningRewritePatternList *patterns, - mlir::MLIRContext *ctx) { +void PopulateXlaToStdPatterns(OwningRewritePatternList *patterns, + mlir::MLIRContext *ctx) { mlir::populateWithGenerated(ctx, patterns); patterns ->insert( @@ -148,3 +135,6 @@ void LegalizeToStandard::runOnFunction() { static PassRegistration legalize_pass( "xla-legalize-to-std", "Legalize from XLA dialect to standard dialect"); + +} // end namespace xla_hlo +} // end namespace mlir From 18ebe7538ff1c1450de9fbf3bfbfe72bdf0605dc Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Thu, 20 Feb 2020 12:19:03 -0800 Subject: [PATCH 372/442] Add tests to make sure optimizer weights are being saved correctly. PiperOrigin-RevId: 296269488 Change-Id: I714119a593c8846faf253aa94ef060b141e4ad84 --- tensorflow/python/keras/saving/save_test.py | 33 +++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tensorflow/python/keras/saving/save_test.py b/tensorflow/python/keras/saving/save_test.py index 602c3cdd359..965a1b88cc7 100644 --- a/tensorflow/python/keras/saving/save_test.py +++ b/tensorflow/python/keras/saving/save_test.py @@ -213,6 +213,39 @@ class TestSaveModel(test.TestCase): rnn_layers[1].kernel.name) self.assertIn('rnn_cell1', rnn_layers[1].kernel.name) + @test_util.run_in_graph_and_eager_modes + def test_saving_optimizer_weights(self): + + class MyModel(keras.Model): + + def __init__(self): + super(MyModel, self).__init__() + self.layer = keras.layers.Dense(1) + + def call(self, x): + return self.layer(x) + + path = os.path.join(self.get_temp_dir(), 'weights_path') + x, y = np.ones((10, 10)), np.ones((10, 1)) + + model = MyModel() + model.compile('rmsprop', loss='bce') + model.train_on_batch(x, y) + model.reset_metrics() + model.save_weights(path, save_format='tf') + + batch_loss = model.train_on_batch(x, y) + + new_model = MyModel() + new_model.compile('rmsprop', loss='bce') + new_model.train_on_batch(x, y) + new_model.reset_metrics() + + new_model.load_weights(path) + new_batch_loss = new_model.train_on_batch(x, y) + + self.assertAllClose(batch_loss, new_batch_loss) + if __name__ == '__main__': test.main() From b80e0db32f986d9cf78862afe7a74e323053cc3b Mon Sep 17 00:00:00 2001 From: HyoukJoong Lee Date: Thu, 20 Feb 2020 12:27:24 -0800 Subject: [PATCH 373/442] Replace replicated cross-replica AR with global AR with division PiperOrigin-RevId: 296271095 Change-Id: I19aed2a6ef74cd26d18151b449e4fe98d34ce8ba --- tensorflow/compiler/xla/service/BUILD | 1 + .../compiler/xla/service/ar_crs_combiner.cc | 66 ++++++++++++++++++- .../xla/service/ar_crs_combiner_test.cc | 38 ++++++++++- 3 files changed, 102 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index bb6219eb584..2e3d1fd9ea6 100755 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -4347,6 +4347,7 @@ cc_library( ":call_graph", ":hlo", ":hlo_pass", + ":hlo_query", ":pattern_matcher", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner.cc b/tensorflow/compiler/xla/service/ar_crs_combiner.cc index ec8c391a542..dae9589e0a9 100644 --- 
a/tensorflow/compiler/xla/service/ar_crs_combiner.cc +++ b/tensorflow/compiler/xla/service/ar_crs_combiner.cc @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_query.h" #include "tensorflow/compiler/xla/service/hlo_replication_analysis.h" #include "tensorflow/compiler/xla/service/pattern_matcher.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -32,6 +33,60 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" namespace xla { +namespace { + +// In SPMD mode, if there's a cross-replica all-reduce that produces the same +// value for all partitions, replaces it with a global all-reduce and then +// divide by the number of partitions. Depending on the topology and the +// implementation of the all-reduce for the backend, this may give a better +// performance. +StatusOr ReplaceReplicatedAllReduce(HloModule* module, + int64 replica_count, + int64 partition_count) { + TF_ASSIGN_OR_RETURN( + auto replication_analysis, + HloReplicationAnalysis::Run(module, /*cross_partition_spmd=*/true)); + + bool changed = false; + int64 next_channel = hlo_query::NextChannelId(*module); + for (auto computation : module->computations()) { + for (auto instruction : computation->instructions()) { + if (auto ar = DynCast(instruction)) { + const Shape& shape = ar->shape(); + if (ar->channel_id()) { + continue; + } + if (ar->replica_groups().size() > 1) { + continue; + } + if (shape.IsTuple() || shape.element_type() != F32) { + continue; + } + // We would need a cost model for the target, but in general we want to + // rewrite only if the replica count in the original op was large. 
+ if (replica_count < 8 * partition_count) { + continue; + } + if (replication_analysis->HloInstructionIsReplicatedAt(ar, {})) { + VLOG(2) << "Replaced replicated all-reduce:" << ar->ToString(); + ar->set_channel_id(next_channel++); + auto divisor = + computation->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(partition_count))); + auto bcast = computation->AddInstruction( + HloInstruction::CreateBroadcast(shape, divisor, {})); + auto div = computation->AddInstruction(HloInstruction::CreateBinary( + ar->shape(), HloOpcode::kDivide, ar, bcast)); + TF_RETURN_IF_ERROR(ar->ReplaceAllUsesWith(div)); + changed = true; + } + } + } + } + return changed; +} + +} // namespace namespace m = match; @@ -508,7 +563,16 @@ StatusOr ArCrsCombiner::Run(HloModule* module) { TF_RETURN_IF_ERROR(KeepProvablyEqualInstructionGroupsMPMD()); } - return RewriteGraph(); + TF_ASSIGN_OR_RETURN(auto changed, RewriteGraph()); + + if (num_replicas_ > 1 && spmd_partition_) { + TF_ASSIGN_OR_RETURN(auto replaced, + ReplaceReplicatedAllReduce(module, num_replicas_, + num_spatial_partitions_)); + changed |= replaced; + } + + return changed; } } // namespace xla diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc index 609da2c33a0..2aaac4f2344 100644 --- a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc +++ b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc @@ -1711,9 +1711,9 @@ HloModule foobar ENTRY %entrycomp (p: bf16[]) -> (f32[]) { %p = bf16[] parameter(0) - %all-reduce.0 = f32[] all-reduce(%p), channel_id=1, replica_groups={{0,1}}, + %all-reduce.0 = f32[] all-reduce(%p), channel_id=1, replica_groups={{0},{1}}, to_apply=%sum.f32 - %all-reduce.2 = f32[] all-reduce(%all-reduce.0), replica_groups={{0,1}}, + %all-reduce.2 = f32[] all-reduce(%all-reduce.0), replica_groups={{0},{1}}, to_apply=%sum.f32 ROOT %tuple = (f32[]) tuple(%all-reduce.2) } @@ -1727,5 +1727,39 @@ ENTRY %entrycomp (p: bf16[]) -> (f32[]) { EXPECT_FALSE(changed); } +TEST_F(ArCrsCombinerTest, ReplaceReplicatedAllReduceSPMD) { + const char* module_str = R"( +HloModule foobar + +%sum.f32 (x: f32[], y: f32[]) -> f32[] { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %add = f32[] add(%x, %y) +} + +ENTRY %entrycomp (p: f32[2,4]) -> f32[2,4] { + %p = f32[2,4] parameter(0), sharding={replicated} + ROOT %all-reduce = f32[2,4] all-reduce(%p), replica_groups={{0,1}}, + to_apply=%sum.f32 +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str)); + ArCrsCombiner combiner(/*num_spatial_partitions=*/4, /*num_replicas=*/64, + /*spmd_partition=*/true); + auto changed = combiner.Run(module.get()).ValueOrDie(); + EXPECT_TRUE(changed); + + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Divide(op::AllReduce(op::Parameter()), + op::Broadcast(op::Constant()))); + + auto ar = root->operand(0); + auto divisor = root->operand(1)->operand(0); + EXPECT_TRUE(ar->channel_id()); + EXPECT_TRUE(divisor->literal().IsAllFloat(4)); +} + } // namespace } // namespace xla From 44e0c1ab8495e417d02db4895ca40fbfb3121409 Mon Sep 17 00:00:00 2001 From: Ran Chen Date: Thu, 20 Feb 2020 12:42:53 -0800 Subject: [PATCH 374/442] Test both CollectiveAllReduce NCCL and RING PiperOrigin-RevId: 296274147 Change-Id: I61f6f445bed2829707f5f7126d0a1f765227496c --- .../distribute/cross_device_ops_test.py | 68 ++++++++++++++----- 1 file changed, 51 insertions(+), 17 deletions(-) diff --git 
a/tensorflow/python/distribute/cross_device_ops_test.py b/tensorflow/python/distribute/cross_device_ops_test.py index c91ec38bfd1..216cb8aba23 100644 --- a/tensorflow/python/distribute/cross_device_ops_test.py +++ b/tensorflow/python/distribute/cross_device_ops_test.py @@ -432,6 +432,8 @@ class MultiWorkerCrossDeviceOpsTest(multi_worker_test_base.MultiWorkerTestBase, NUM_WORKERS = 3 +CollectiveCommunication = cross_device_ops_lib.CollectiveCommunication + class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, CrossDeviceOpsTestBase): @@ -454,6 +456,7 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, task_type, task_id, num_gpus=0, + communication=CollectiveCommunication.AUTO, use_strategy_object=False, local_mode=False, num_packs=1): @@ -469,15 +472,23 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, devices = ["/device:CPU:0"] if use_strategy_object: - strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy() + strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy( + communication=communication) strategy.extended._collective_keys = collective_keys strategy.extended._cross_device_ops._collective_keys = collective_keys return strategy, devices, "" else: collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce( - 1, num_gpus, collective_keys=collective_keys, num_packs=num_packs) + 1, + num_gpus, + collective_keys=collective_keys, + num_packs=num_packs, + communication=communication) return collective_all_reduce_ops, devices, "" else: + # NCCL requires physical GPUs for every replica, which we can't do with + # simulated multi host set up now. + assert communication != CollectiveCommunication.NCCL if num_gpus: devices = [ "/job:%s/task:%d/replica:0/device:GPU:%d" % (task_type, task_id, i) @@ -489,7 +500,8 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, ] if use_strategy_object: - strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy() + strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy( + communication=communication) strategy.configure( cluster_spec=self._cluster_spec, task_type=task_type, @@ -500,8 +512,11 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, "grpc://" + self._cluster_spec[task_type][task_id]) else: collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce( - NUM_WORKERS, num_gpus, collective_keys=collective_keys, - num_packs=num_packs) + NUM_WORKERS, + num_gpus, + collective_keys=collective_keys, + num_packs=num_packs, + communication=communication) return (collective_all_reduce_ops, devices, "grpc://" + self._cluster_spec[task_type][task_id]) @@ -509,6 +524,7 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, task_type, task_id, num_gpus, + communication, use_strategy_object=False, local_mode=False, num_packs=1): @@ -516,6 +532,7 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, task_type, task_id, num_gpus, + communication=communication, use_strategy_object=use_strategy_object, local_mode=local_mode, num_packs=num_packs) @@ -645,11 +662,16 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, task_type, task_id, num_gpus, + communication, batch_reduce, variable_length, local_mode=False): collective_all_reduce, devices, master_target = self._get_test_objects( - task_type, task_id, num_gpus, local_mode=local_mode) + task_type, + task_id, + num_gpus, + communication=communication, + 
local_mode=local_mode) if local_mode: num_workers = 1 worker_device = None @@ -704,6 +726,7 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, self._test_reduction, self._cluster_spec, required_gpus, + communication=CollectiveCommunication.RING, use_strategy_object=use_strategy_object, num_packs=num_packs) @@ -711,25 +734,32 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, combinations.combine( mode=["graph"], required_gpus=[0, 1, 2], - batch_reduce=[True], variable_length=[True, False])) - def testReduceIndexedSlicesDistributed(self, required_gpus, batch_reduce, - variable_length): - self._run_between_graph_clients(self._test_reduce_indexed_slices, - self._cluster_spec, required_gpus, - batch_reduce, variable_length) + def testReduceIndexedSlicesDistributed(self, required_gpus, variable_length): + self._run_between_graph_clients( + self._test_reduce_indexed_slices, + self._cluster_spec, + required_gpus, + communication=CollectiveCommunication.RING, + batch_reduce=True, + variable_length=variable_length) # Collective ops doesn't support strategy with one device. @combinations.generate( combinations.combine( mode=["graph"], required_gpus=2, + communication=[ + CollectiveCommunication.NCCL, CollectiveCommunication.RING + ], use_strategy_object=[True, False])) - def testReductionLocal(self, required_gpus, use_strategy_object): + def testReductionLocal(self, required_gpus, communication, + use_strategy_object): self._test_reduction( None, None, required_gpus, + communication=communication, use_strategy_object=use_strategy_object, local_mode=True) @@ -738,15 +768,19 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, mode=["graph"], required_gpus=2, batch_reduce=[True, False], - variable_length=[True, False])) + variable_length=[True, False], + communication=[ + CollectiveCommunication.NCCL, CollectiveCommunication.RING + ])) def testReduceIndexedSlicesLocal(self, required_gpus, batch_reduce, - variable_length): + variable_length, communication): self._test_reduce_indexed_slices( None, None, required_gpus, - batch_reduce, - variable_length, + communication=communication, + batch_reduce=batch_reduce, + variable_length=variable_length, local_mode=True) From 302b017a5bd976f66356b8892b03c4cddffc31b1 Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Thu, 20 Feb 2020 12:53:51 -0800 Subject: [PATCH 375/442] [tf.data] Add pre-requisite check for `padded_batch`. 
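For illustration, a minimal sketch of the unsupported input that the new check rejects up front, written against the public tf.data API rather than the internal test helpers used in the test below (the exact error wording is not reproduced here):

```python
import tensorflow as tf

# A dataset of sparse tensors cannot be padded-batched; the added check turns
# this into an immediate TypeError instead of a failure later in the kernel.
st = tf.sparse.SparseTensor(indices=[[0, 0]], values=[42], dense_shape=[1, 1])
ds = tf.data.Dataset.from_tensors(st).repeat(10)

try:
    ds.padded_batch(10)
except TypeError as err:
    # The component spec is not a dense TensorSpec, so padded_batch refuses it.
    print(err)
```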
PiperOrigin-RevId: 296276337 Change-Id: I068975d1320e688a2826114137149f66ab37b4e0 --- .../data/kernel_tests/padded_batch_test.py | 17 +++++++++++++---- tensorflow/python/data/ops/dataset_ops.py | 12 +++++++----- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/tensorflow/python/data/kernel_tests/padded_batch_test.py b/tensorflow/python/data/kernel_tests/padded_batch_test.py index beec8c3bd6b..e42da988989 100644 --- a/tensorflow/python/data/kernel_tests/padded_batch_test.py +++ b/tensorflow/python/data/kernel_tests/padded_batch_test.py @@ -31,6 +31,7 @@ from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.ops import string_ops +from tensorflow.python.ops.ragged import ragged_tensor_value from tensorflow.python.platform import test from tensorflow.python.util import compat @@ -224,12 +225,20 @@ class PaddedBatchTest(test_base.DatasetTestBase, parameterized.TestCase): @combinations.generate(test_base.default_test_combinations()) def testPaddedBatchSparseError(self): - def _map_fn(i): - return sparse_tensor.SparseTensorValue( - indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1]), i + st = sparse_tensor.SparseTensorValue( + indices=[[0, 0]], values=([42]), dense_shape=[1, 1]) with self.assertRaises(TypeError): - _ = dataset_ops.Dataset.range(10).map(_map_fn).padded_batch(10) + _ = dataset_ops.Dataset.from_tensors(st).repeat(10).padded_batch(10) + + @combinations.generate(test_base.default_test_combinations()) + def testPaddedBatchRaggedError(self): + + rt = ragged_tensor_value.RaggedTensorValue( + np.array([0, 42]), np.array([0, 2], dtype=np.int64)) + + with self.assertRaises(TypeError): + _ = dataset_ops.Dataset.from_tensors(rt).repeat(10).padded_batch(10) @combinations.generate(test_base.default_test_combinations()) def testPaddedBatchShapeErrorWrongRank(self): diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py index 4b25eb3a273..3e104793ca3 100644 --- a/tensorflow/python/data/ops/dataset_ops.py +++ b/tensorflow/python/data/ops/dataset_ops.py @@ -39,7 +39,6 @@ from tensorflow.python.data.ops import iterator_ops from tensorflow.python.data.util import nest from tensorflow.python.data.util import options as options_lib from tensorflow.python.data.util import random_seed -from tensorflow.python.data.util import sparse from tensorflow.python.data.util import structure from tensorflow.python.data.util import traverse from tensorflow.python.eager import context @@ -3857,10 +3856,13 @@ class PaddedBatchDataset(UnaryDataset): drop_remainder): """See `Dataset.batch()` for details.""" self._input_dataset = input_dataset - if sparse.any_sparse(get_legacy_output_classes(input_dataset)): - # TODO(b/63669786): support batching of sparse tensors - raise TypeError( - "Batching of padded sparse tensors is not currently supported") + + def check_types(component_spec): + if not isinstance(component_spec, tensor_spec.TensorSpec): + raise TypeError("Padded batching of components of type ", + type(component_spec), " is not supported.") + + nest.map_structure(check_types, input_dataset.element_spec) self._input_dataset = input_dataset self._batch_size = ops.convert_to_tensor( batch_size, dtype=dtypes.int64, name="batch_size") From 52726fcd101c902bf8de705d5f02b13a45a19d5f Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Thu, 20 Feb 2020 13:11:02 -0800 Subject: [PATCH 376/442] Formatting changes to support tooling. 
PiperOrigin-RevId: 296279985 Change-Id: I3b983f245ac4ede32bceac8c609b521a4a7fc22b --- tensorflow/python/BUILD | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 15d21d34bc5..25fc7c199a1 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -2147,9 +2147,7 @@ tf_py_test( tf_gen_op_wrapper_private_py( name = "functional_ops_gen", - visibility = [ - "//learning/brain/python/ops:__pkg__", - ], + visibility = ["//learning/brain/python/ops:__pkg__"], ) py_library( @@ -2860,9 +2858,7 @@ tf_gen_op_wrapper_private_py( tf_gen_op_wrapper_private_py( name = "parsing_ops_gen", - visibility = [ - "//learning/brain/python/ops:__pkg__", - ], + visibility = ["//learning/brain/python/ops:__pkg__"], ) tf_gen_op_wrapper_private_py( From d5fd0b2931c85ac39e79e1eaceaabfb2d83c5db8 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 20 Feb 2020 13:17:08 -0800 Subject: [PATCH 377/442] Fix eager:core_test test indeterminism. One of the tests in the suite left the context inside a CPU device context. This made other tests fail depending on the order in which they were executed. It also caused some tests to /pass/ because they required that device context to be (accidentally) active. PiperOrigin-RevId: 296281260 Change-Id: I6a2dcb4bd566591c3d198d1e2b2c2545af108916 --- tensorflow/python/eager/core_test.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/tensorflow/python/eager/core_test.py b/tensorflow/python/eager/core_test.py index 8993efd4085..aabd350a3ce 100644 --- a/tensorflow/python/eager/core_test.py +++ b/tensorflow/python/eager/core_test.py @@ -63,7 +63,7 @@ def truncated_normal(shape): def current_device(): - return constant_op.constant(1.).device + return array_ops.identity(1.).device def configure_virtual_cpus(): @@ -394,20 +394,22 @@ class TFETest(test_util.TensorFlowTestCase): def testMultiCpuPlacement(self): with ops.device('cpu:1'): x = constant_op.constant(1.0) - y = array_ops.identity(x) + with ops.device('cpu:0'): + y = array_ops.identity(x) self.assertEqual(x.device, '/job:localhost/replica:0/task:0/device:CPU:1') self.assertEqual(y.device, '/job:localhost/replica:0/task:0/device:CPU:0') @test_util.run_gpu_only def testShouldCopy(self): - with ops.device('gpu:0'): - x = constant_op.constant(1.0) + with ops.device('GPU:0'): + x = array_ops.identity(1.0) + self.assertEqual(x.device, '/job:localhost/replica:0/task:0/device:GPU:0') y = array_ops.identity(x) # The value we're testing y.device against will depend on what the behavior # of not explicitly specifying a device in the context is. This behavior is # subject to change (for example, in the future we may want to use GPUs, if # available, when no device is explicitly provided) - self.assertEqual(y.device, '/job:localhost/replica:0/task:0/device:CPU:0') + self.assertEqual(y.device, current_device()) def testContextSwitchStackContainsEagerMode(self): # Eager execution has been enabled, and no other context switch has @@ -488,6 +490,7 @@ class TFETest(test_util.TensorFlowTestCase): self.assertEndsWith(current_device(), 'GPU:0') gpu.__exit__() self.assertEndsWith(current_device(), 'CPU:0') + cpu.__exit__() @test_util.run_gpu_only def testReEntrant(self): @@ -563,12 +566,14 @@ class TFETest(test_util.TensorFlowTestCase): def simple_fn(unused_handle): return 1. 
+ with ops.device('CPU:0'): + test_var = variables.Variable([2., 3.]) + @def_function.function def test_fn(v): script_ops.eager_py_func(simple_fn, [v.handle], dtypes.float32) return 1. - test_var = variables.Variable([2., 3.]) self.assertAllEqual(test_fn(test_var), 1.0) def testPyFunctionAsync(self): @@ -1014,7 +1019,8 @@ class TFETest(test_util.TensorFlowTestCase): t.join() def testEmptyResourceReturned(self): - v = variables.Variable(1.) + with ops.device('CPU:0'): + v = variables.Variable(1.) empty_handle = array_ops.gather( v.handle[array_ops.newaxis], array_ops.zeros([0], dtype=dtypes.int32)) self.assertEqual( From e42390c8ce6098b7383890696208f4619064c8a6 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Thu, 20 Feb 2020 13:18:29 -0800 Subject: [PATCH 378/442] Formatting changes to make tooling easier. PiperOrigin-RevId: 296281482 Change-Id: I736d812a7fcdc05695367776cb4177649c69cffc --- tensorflow/compiler/xla/service/BUILD | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 2e3d1fd9ea6..da50e92de32 100755 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -27,9 +27,7 @@ package_group( includes = [ "//tensorflow/compiler/xla:friends", ], - packages = [ - "//learning/brain/experimental/tf_runtime/...", - ], + packages = ["//learning/brain/experimental/tf_runtime/..."], ) tf_proto_library_cc( From 681ddbe2bd305bbf62dce2340779d8f9e6c969f4 Mon Sep 17 00:00:00 2001 From: Alexander Belyaev Date: Thu, 20 Feb 2020 13:22:23 -0800 Subject: [PATCH 379/442] [MLIR] Use PassOptions in LhloFuseLinalg pass instead of LLVM CL opts. PiperOrigin-RevId: 296282382 Change-Id: I1c4223c40e8816259984bd2971c4f535e3735830 --- tensorflow/compiler/mlir/xla/BUILD | 1 + .../mlir/xla/tests/lhlo-fuse-linalg.mlir | 4 +- .../mlir/xla/transforms/lhlo_fuse_linalg.cc | 43 +++++++++++-------- 3 files changed, 28 insertions(+), 20 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index d3b7215d26d..df3ffd0599c 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -204,6 +204,7 @@ cc_library( deps = [ ":lhlo", "@com_google_absl//absl/memory", + "@llvm-project//llvm:support", "@llvm-project//mlir:LinalgOps", "@llvm-project//mlir:LinalgTransforms", "@llvm-project//mlir:Pass", diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir index a9ffc116392..7f7e37ebe66 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir @@ -1,6 +1,6 @@ // RUN: tf-opt -lhlo-fuse-linalg %s -o - | FileCheck %s --dump-input=always -// RUN: tf-opt -lhlo-fuse-linalg -tile-sizes-for-linalg-fusion=2,3 %s -o - | FileCheck %s -check-prefix=TILED --dump-input-on-failure -// RUN: tf-opt -lhlo-fuse-linalg -tile-to-parallel-loops-for-linalg-fusion %s -o - | FileCheck %s -check-prefix=PLOOP --dump-input-on-failure +// RUN: tf-opt -lhlo-fuse-linalg=tile-sizes=2,3 %s -o - | FileCheck %s -check-prefix=TILED --dump-input-on-failure +// RUN: tf-opt -lhlo-fuse-linalg=use-parallel-loops %s -o - | FileCheck %s -check-prefix=PLOOP --dump-input-on-failure #map0 = affine_map<(d0, d1) -> (d0, d1)> diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc index 6b2b548550a..a52d2318ba7 100644 --- 
a/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc @@ -18,31 +18,26 @@ limitations under the License. #include "mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h" #include "absl/memory/memory.h" +#include "llvm/ADT/ArrayRef.h" #include "mlir/Dialect/Linalg/Utils/Utils.h" // TF:llvm-project #include "mlir/Pass/Pass.h" // TF:llvm-project #include "mlir/Transforms/FoldUtils.h" // TF:llvm-project -// NOLINTNEXTLINE -static llvm::cl::opt tile_to_parallel_loops_for_linalg_fusion( - "tile-to-parallel-loops-for-linalg-fusion", - llvm::cl::desc( - "Tiles GenericOp consumer to parallel loops before linalg fusion"), - llvm::cl::init(false)); - -// NOLINTNEXTLINE -static llvm::cl::list tile_sizes_for_linalg_fusion( - "tile-sizes-for-linalg-fusion", - llvm::cl::desc( - "Tile sizes by which to tile linalg generic before linalg fusion"), - llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated); - namespace mlir { namespace xla_lhlo { namespace { using linalg::LinalgOp; -struct LhloFuseLinalg : public FunctionPass { +class LhloFuseLinalg : public FunctionPass { + public: + LhloFuseLinalg() = default; + LhloFuseLinalg(const LhloFuseLinalg&) {} + LhloFuseLinalg(bool use_parallel_loops, llvm::ArrayRef tile_sizes) { + tile_sizes_->assign(tile_sizes.begin(), tile_sizes.end()); + use_parallel_loops_.setValue(use_parallel_loops); + } + void runOnFunction() override { auto func = getFunction(); @@ -64,8 +59,8 @@ struct LhloFuseLinalg : public FunctionPass { OpBuilder b(func); OperationFolder folder(func.getContext()); func.walk([&](linalg::GenericOp generic_op) { - SmallVector tile_sizes(tile_sizes_for_linalg_fusion.begin(), - tile_sizes_for_linalg_fusion.end()); + SmallVector tile_sizes(tile_sizes_.begin(), + tile_sizes_.end()); if (tile_sizes.empty()) { tile_sizes = SmallVector(generic_op.getNumInputsAndOutputs(), 1); @@ -105,13 +100,25 @@ struct LhloFuseLinalg : public FunctionPass { bool tileGenericOp(LinalgOp op, ArrayRef tile_sizes, OpBuilder* b, OperationFolder* folder) { auto tiled_generic_op = - tile_to_parallel_loops_for_linalg_fusion + use_parallel_loops_ ? linalg::tileLinalgOpToParallelLoops(*b, op, tile_sizes, /*permutation=*/{}, folder) : linalg::tileLinalgOp(*b, op, tile_sizes, /*permutation=*/{}, folder); return tiled_generic_op.hasValue(); } + + Option use_parallel_loops_{ + *this, "use-parallel-loops", + llvm::cl::desc( + "Tiles GenericOp consumer to parallel loops before linalg fusion"), + llvm::cl::init(false)}; + + ListOption tile_sizes_{ + *this, "tile-sizes", + llvm::cl::desc( + "Tile sizes by which to tile linalg generic before linalg fusion"), + llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated}; }; } // namespace From f952bb1ccbe4d83e2a3a8398c96bd18a59f0915a Mon Sep 17 00:00:00 2001 From: Prakalp Srivastava Date: Thu, 20 Feb 2020 13:25:27 -0800 Subject: [PATCH 380/442] Move Iota folding to xla_hlo-std legalization. Iota op folding can lead to huge constants. Not every target would like to fold iota ops which increases the file size. Moving it to xla_hlo to standard legalization which was the original intent behind adding this fold. 
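As a rough sketch of why eager folding is costly, here is a small NumPy model of the folded value, assuming the same semantics as the removed fold (each element equals its coordinate along `iota_dimension`, broadcast over every other dimension); the shapes are hypothetical and chosen only to show the growth:

```python
import numpy as np

# Models constant-folding an iota op: element value = index along
# iota_dimension, broadcast across all other dimensions.
def fold_iota(shape, iota_dimension):
    values = np.arange(shape[iota_dimension])
    bcast = [1] * len(shape)
    bcast[iota_dimension] = shape[iota_dimension]
    return np.broadcast_to(values.reshape(bcast), shape)

print(fold_iota((2, 4), 0))   # rows: [0 0 0 0], [1 1 1 1]
print(fold_iota((2, 4), 1))   # rows: [0 1 2 3], [0 1 2 3]

# A 1024x1024x64 iota folded this way would bake ~67 million integers into
# the module as a dense constant, which is why the expansion now happens only
# in the xla_hlo-to-std legalization where a target opts in.
```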
PiperOrigin-RevId: 296282985 Change-Id: I71cb1679796ff0d36251ddb2f4cd0fce8aa75192 --- tensorflow/compiler/mlir/xla/ir/hlo_ops.cc | 25 -------- tensorflow/compiler/mlir/xla/ir/hlo_ops.td | 2 - .../compiler/mlir/xla/tests/canonicalize.mlir | 8 +++ tensorflow/compiler/mlir/xla/tests/iota.mlir | 61 ------------------- .../mlir/xla/tests/legalize-to-std.mlir | 48 +++++++++++++++ .../xla/transforms/legalize_to_standard.cc | 39 +++++++++++- 6 files changed, 92 insertions(+), 91 deletions(-) delete mode 100644 tensorflow/compiler/mlir/xla/tests/iota.mlir diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc index 481c12b42c2..41ef8690735 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc @@ -177,31 +177,6 @@ void ConstOp::build(Builder* builder, OperationState& result, Attribute value) { // IotaOp //===----------------------------------------------------------------------===// -OpFoldResult IotaOp::fold(ArrayRef operands) { - const auto output_type = getResult().getType().cast(); - const auto output_size = output_type.getNumElements(); - const auto dimension = iota_dimension().getSExtValue(); - const auto max_dim_size = output_type.getDimSize(dimension); - int bitwidth = output_type.getElementType().getIntOrFloatBitWidth(); - - llvm::SmallVector values; - values.reserve(output_size); - - int64_t increase_stride = output_size; - for (int i = 0; i <= dimension; i++) { - increase_stride /= output_type.getDimSize(i); - } - - int64_t current_value = 0; - for (int i = 0; i < output_size; i++) { - int64_t value = (current_value / increase_stride) % max_dim_size; - values.push_back(APInt(bitwidth, value)); - ++current_value; - } - - return DenseIntElementsAttr::get(output_type, values); -} - static LogicalResult Verify(IotaOp op) { auto shape = op.getType().cast(); if (!shape.hasRank()) return success(); diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td index 28c0a859f7d..269e1cc8897 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td @@ -122,8 +122,6 @@ def HLO_IotaOp : HLO_Op<"iota", [NoSideEffect]>, BASE_HLO_IotaOp { let results = (outs HLO_IntFpOrComplexTensor:$output); - let hasFolder = 1; - // TODO(b/130357376): Iota has special conversion logic to HLO. 
let hasCustomHLOConverter = 1; } diff --git a/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir b/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir index fa39b77918a..2232063fd6a 100644 --- a/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir @@ -49,6 +49,14 @@ func @complex_collapse_fold(%arg0: tensor<4xcomplex>) -> tensor<4xcomplex> } +// CHECK-LABEL: @iota_not_lowered_to_constant +func @iota_not_lowered_to_constant() -> tensor<4xi32> { + // CHECK: [[RESULT:%.*]] = "xla_hlo.iota" + // CHECK: return [[RESULT]] + %0 = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<4xi32> + return %0 : tensor<4xi32> +} + // CHECK-LABEL: @unary_einsum func @unary_einsum(%arg0: tensor<2x3xf32>) -> tensor<2x2xf32> { // CHECK: %[[ONE:.*]] = xla_hlo.constant dense<1.000000e+00> : tensor diff --git a/tensorflow/compiler/mlir/xla/tests/iota.mlir b/tensorflow/compiler/mlir/xla/tests/iota.mlir deleted file mode 100644 index 65b9f73ba67..00000000000 --- a/tensorflow/compiler/mlir/xla/tests/iota.mlir +++ /dev/null @@ -1,61 +0,0 @@ -// RUN: tf-opt %s -split-input-file -xla-legalize-to-std | FileCheck %s - -// ----- - -// CHECK-LABEL: func @iota.const.1() -> tensor<4xi32> { -func @iota.const.1() -> tensor<4xi32> { - // CHECK-NEXT: %[[CST:.*]] = constant dense<[0, 1, 2, 3]> : tensor<4xi32> - %0 = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<4xi32> - // CHECK-NEXT: return %[[CST]] : tensor<4xi32> - return %0 : tensor<4xi32> -} - -// ----- - -// CHECK-LABEL: func @iota.const.2() -> tensor<2x4xi32> { -func @iota.const.2() -> tensor<2x4xi32> { - // CHECK-NEXT: %[[CST:.*]] = constant dense<{{\[\[}}0, 0, 0, 0], [1, 1, 1, 1]]> : tensor<2x4xi32> - %0 = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<2x4xi32> - // CHECK-NEXT: return %[[CST]] : tensor<2x4xi32> - return %0 : tensor<2x4xi32> -} - -// ----- - -// CHECK-LABEL: func @iota.const.3() -> tensor<2x4xi32> { -func @iota.const.3() -> tensor<2x4xi32> { - // CHECK-NEXT: %[[CST:.*]] = constant dense<{{\[\[}}0, 1, 2, 3], [0, 1, 2, 3]]> : tensor<2x4xi32> - %0 = "xla_hlo.iota"() {iota_dimension = 1 : i64} : () -> tensor<2x4xi32> - // CHECK-NEXT: return %[[CST]] : tensor<2x4xi32> - return %0 : tensor<2x4xi32> -} - -// ----- - -// CHECK-LABEL: func @iota.const.4() -> tensor<2x3x4xi32> { -func @iota.const.4() -> tensor<2x3x4xi32> { - // CHECK-NEXT: %[[CST:.*]] = constant dense<{{\[\[\[}}0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0{{\]\]}}, {{\[\[}}1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]> : tensor<2x3x4xi32> - %0 = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<2x3x4xi32> - // CHECK-NEXT: return %[[CST]] : tensor<2x3x4xi32> - return %0 : tensor<2x3x4xi32> -} - -// ----- - -// CHECK-LABEL: func @iota.const.5() -> tensor<2x3x4xi32> { -func @iota.const.5() -> tensor<2x3x4xi32> { - // CHECK-NEXT: %[[CST:.*]] = constant dense<{{\[\[\[}}0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2{{\]\]}}, {{\[\[}}0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]]]> : tensor<2x3x4xi32> - %0 = "xla_hlo.iota"() {iota_dimension = 1 : i64} : () -> tensor<2x3x4xi32> - // CHECK-NEXT: return %[[CST]] : tensor<2x3x4xi32> - return %0 : tensor<2x3x4xi32> -} - -// ----- - -// CHECK-LABEL: func @iota.const.6() -> tensor<2x3x4xi32> { -func @iota.const.6() -> tensor<2x3x4xi32> { - // CHECK-NEXT: %[[CST:.*]] = constant dense<{{\[\[\[}}0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3{{\]\]}}, {{\[\[}}0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3]]]> : tensor<2x3x4xi32> - %0 = "xla_hlo.iota"() {iota_dimension = 2 : i64} : () -> tensor<2x3x4xi32> - 
// CHECK-NEXT: return %[[CST]] : tensor<2x3x4xi32> - return %0 : tensor<2x3x4xi32> -} diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-to-std.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-to-std.mlir index 1d2cf767939..f56174ae075 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-to-std.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-to-std.mlir @@ -135,3 +135,51 @@ func @float_constant() -> (tensor, tensor<2x3xf32>, tensor<2x3xf32>) { return %0, %1, %2: tensor, tensor<2x3xf32>, tensor<2x3xf32> } +// Test Iota lowering to constant +// CHECK-LABEL: func @iota.const.1() -> tensor<4xi32> { +func @iota.const.1() -> tensor<4xi32> { + // CHECK-NEXT: %[[CST:.*]] = constant dense<[0, 1, 2, 3]> : tensor<4xi32> + %0 = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<4xi32> + // CHECK-NEXT: return %[[CST]] : tensor<4xi32> + return %0 : tensor<4xi32> +} + +// CHECK-LABEL: func @iota.const.2() -> tensor<2x4xi32> { +func @iota.const.2() -> tensor<2x4xi32> { + // CHECK-NEXT: %[[CST:.*]] = constant dense<{{\[\[}}0, 0, 0, 0], [1, 1, 1, 1]]> : tensor<2x4xi32> + %0 = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<2x4xi32> + // CHECK-NEXT: return %[[CST]] : tensor<2x4xi32> + return %0 : tensor<2x4xi32> +} + +// CHECK-LABEL: func @iota.const.3() -> tensor<2x4xi32> { +func @iota.const.3() -> tensor<2x4xi32> { + // CHECK-NEXT: %[[CST:.*]] = constant dense<{{\[\[}}0, 1, 2, 3], [0, 1, 2, 3]]> : tensor<2x4xi32> + %0 = "xla_hlo.iota"() {iota_dimension = 1 : i64} : () -> tensor<2x4xi32> + // CHECK-NEXT: return %[[CST]] : tensor<2x4xi32> + return %0 : tensor<2x4xi32> +} + +// CHECK-LABEL: func @iota.const.4() -> tensor<2x3x4xi32> { +func @iota.const.4() -> tensor<2x3x4xi32> { + // CHECK-NEXT: %[[CST:.*]] = constant dense<{{\[\[\[}}0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0{{\]\]}}, {{\[\[}}1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]> : tensor<2x3x4xi32> + %0 = "xla_hlo.iota"() {iota_dimension = 0 : i64} : () -> tensor<2x3x4xi32> + // CHECK-NEXT: return %[[CST]] : tensor<2x3x4xi32> + return %0 : tensor<2x3x4xi32> +} + +// CHECK-LABEL: func @iota.const.5() -> tensor<2x3x4xi32> { +func @iota.const.5() -> tensor<2x3x4xi32> { + // CHECK-NEXT: %[[CST:.*]] = constant dense<{{\[\[\[}}0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2{{\]\]}}, {{\[\[}}0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]]]> : tensor<2x3x4xi32> + %0 = "xla_hlo.iota"() {iota_dimension = 1 : i64} : () -> tensor<2x3x4xi32> + // CHECK-NEXT: return %[[CST]] : tensor<2x3x4xi32> + return %0 : tensor<2x3x4xi32> +} + +// CHECK-LABEL: func @iota.const.6() -> tensor<2x3x4xi32> { +func @iota.const.6() -> tensor<2x3x4xi32> { + // CHECK-NEXT: %[[CST:.*]] = constant dense<{{\[\[\[}}0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3{{\]\]}}, {{\[\[}}0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3]]]> : tensor<2x3x4xi32> + %0 = "xla_hlo.iota"() {iota_dimension = 2 : i64} : () -> tensor<2x3x4xi32> + // CHECK-NEXT: return %[[CST]] : tensor<2x3x4xi32> + return %0 : tensor<2x3x4xi32> +} diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard.cc index 9720d2abd8e..5ee6010c3a8 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_to_standard.cc @@ -105,6 +105,41 @@ class CompareFConvert : public OpRewritePattern { } }; +class ConvertIotaOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(xla_hlo::IotaOp op, + PatternRewriter &rewriter) const 
override { + auto output_type = op.getType().cast(); + // TODO(prakalps): Handle FP and ComplexType iota ops. + if (!output_type.getElementType().isa()) return matchFailure(); + auto output_size = output_type.getNumElements(); + auto dimension = op.iota_dimension().getSExtValue(); + auto max_dim_size = output_type.getDimSize(dimension); + int bitwidth = output_type.getElementType().getIntOrFloatBitWidth(); + + llvm::SmallVector values; + values.reserve(output_size); + + int64_t increase_stride = output_size; + for (int i = 0; i <= dimension; i++) { + increase_stride /= output_type.getDimSize(i); + } + + int64_t current_value = 0; + for (int i = 0; i < output_size; i++) { + int64_t value = (current_value / increase_stride) % max_dim_size; + values.push_back(APInt(bitwidth, value)); + ++current_value; + } + + rewriter.replaceOpWithNewOp( + op, DenseIntElementsAttr::get(output_type, values)); + return matchSuccess(); + } +}; + } // end anonymous namespace namespace { @@ -121,9 +156,7 @@ std::unique_ptr> createLegalizeToStdPass() { void PopulateXlaToStdPatterns(OwningRewritePatternList *patterns, mlir::MLIRContext *ctx) { mlir::populateWithGenerated(ctx, patterns); - patterns - ->insert( - ctx); + patterns->insert(ctx); } /// Perform the lowering to standard dialect. From 5fc1ad961b83dd36941aa2b447a4a602b622e2c9 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Thu, 20 Feb 2020 13:27:14 -0800 Subject: [PATCH 381/442] Use resource_loader to reference in-tree resources. PiperOrigin-RevId: 296283401 Change-Id: I8531d318a672c1d496f334932ec7355b1d343adf --- tensorflow/python/kernel_tests/decode_image_op_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/kernel_tests/decode_image_op_test.py b/tensorflow/python/kernel_tests/decode_image_op_test.py index ba5770001ad..58678a404b4 100644 --- a/tensorflow/python/kernel_tests/decode_image_op_test.py +++ b/tensorflow/python/kernel_tests/decode_image_op_test.py @@ -27,9 +27,10 @@ from tensorflow.python.framework import test_util from tensorflow.python.ops import image_ops from tensorflow.python.ops import io_ops import tensorflow.python.ops.nn_grad # pylint: disable=unused-import +from tensorflow.python.platform import resource_loader from tensorflow.python.platform import test -prefix_path = "tensorflow/core/lib" +prefix_path = resource_loader.get_path_to_datafile("../../core/lib") class DecodeImageOpTest(test.TestCase): From ebf01547f5f76ae0c65e708c09d60aa8e06c30a9 Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Thu, 20 Feb 2020 13:46:49 -0800 Subject: [PATCH 382/442] Temporarily disable mkl_dequantize_op_test PiperOrigin-RevId: 296287811 Change-Id: Icc89d8cf863eef95ba3e4c1b7098bcb6327a9494 --- tensorflow/core/kernels/BUILD | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index e42de02b979..b6af5ccc3e2 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -7990,6 +7990,11 @@ tf_cc_test_mkl( name = "mkl_dequantize_op_test", size = "small", srcs = ["mkl_dequantize_op_test.cc"], + # TODO(b/149940073): Re-enable. + tags = [ + "no_oss", + "notap", + ], deps = [ ":mkl_dequantize_op", ":mkl_tfconv_op", From 006060f4230cd1386a554931273f85cac668b0f2 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 20 Feb 2020 14:14:13 -0800 Subject: [PATCH 383/442] Go: Update generated wrapper functions for TensorFlow ops. 
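A note on the `resource_loader` change to `decode_image_op_test.py` earlier in this series: `resource_loader.get_path_to_datafile` resolves its argument relative to the source file that calls it rather than relative to the process working directory, which is why the hard-coded `"tensorflow/core/lib"` prefix could be dropped. A rough sketch of the pattern, with file names that are illustrative only and not taken from the patch:

```python
import os

from tensorflow.python.platform import resource_loader

# Resolved against this source file's own location, so the data is found no
# matter which directory the test runner was started from.
prefix_path = resource_loader.get_path_to_datafile("../../core/lib")

# Illustrative only: the real test joins the prefix with its image file names
# in the same way.
sample = os.path.join(prefix_path, "png", "testdata", "example.png")
print(sample)
```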
PiperOrigin-RevId: 296294694 Change-Id: I57dcfe38ff03af3cf7987554f5553b66f04716db --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index ecdce1e627b..449a95765a5 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. 
The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21345,7 +21345,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22053,7 +22053,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22249,7 +22249,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22318,7 +22318,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22433,7 +22433,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22492,7 +22492,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22666,7 +22666,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22857,7 +22857,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25297,7 +25297,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25629,7 +25629,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25679,7 +25679,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25929,7 +25929,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26559,7 +26559,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27624,7 +27624,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45536,7 +45536,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 554f16e9701a34399f45617ea90675f30d30e0d0 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Thu, 20 Feb 2020 14:19:06 -0800 Subject: [PATCH 384/442] Make use of GetDataDependencyFilepath and JoinPath to build paths which will work across operating systems. The previous implementation doesn't work correctly on Windows. PiperOrigin-RevId: 296295721 Change-Id: I1d4d067a5c938cfd6c1ce8724bb9f49ea89a4bda --- tensorflow/cc/saved_model/BUILD | 1 + tensorflow/cc/saved_model/reader_test.cc | 30 +++++++++++++----------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD index e680cc72b3b..882b4032f76 100644 --- a/tensorflow/cc/saved_model/BUILD +++ b/tensorflow/cc/saved_model/BUILD @@ -68,6 +68,7 @@ tf_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", + "//tensorflow/core/platform:resource_loader", ], ) diff --git a/tensorflow/cc/saved_model/reader_test.cc b/tensorflow/cc/saved_model/reader_test.cc index e898664c221..bc630bcaede 100644 --- a/tensorflow/cc/saved_model/reader_test.cc +++ b/tensorflow/cc/saved_model/reader_test.cc @@ -21,15 +21,22 @@ limitations under the License. 
#include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/resource_loader.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { namespace { -constexpr char kTestDataPbTxt[] = - "cc/saved_model/testdata/half_plus_two_pbtxt/00000123"; -constexpr char kTestDataSharded[] = - "cc/saved_model/testdata/half_plus_two/00000123"; +string TestDataPbTxt() { + return io::JoinPath("tensorflow", "cc", "saved_model", "testdata", + "half_plus_two_pbtxt", "00000123"); +} + +string TestDataSharded() { + return io::JoinPath("tensorflow", "cc", "saved_model", "testdata", + "half_plus_two", "00000123"); +} class ReaderTest : public ::testing::Test { protected: @@ -49,8 +56,7 @@ class ReaderTest : public ::testing::Test { TEST_F(ReaderTest, TagMatch) { MetaGraphDef meta_graph_def; - const string export_dir = - io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded); + const string export_dir = GetDataDependencyFilepath(TestDataSharded()); TF_ASSERT_OK(ReadMetaGraphDefFromSavedModel(export_dir, {kSavedModelTagServe}, &meta_graph_def)); CheckMetaGraphDef(meta_graph_def); @@ -59,8 +65,7 @@ TEST_F(ReaderTest, TagMatch) { TEST_F(ReaderTest, NoTagMatch) { MetaGraphDef meta_graph_def; - const string export_dir = - io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded); + const string export_dir = GetDataDependencyFilepath(TestDataSharded()); Status st = ReadMetaGraphDefFromSavedModel(export_dir, {"missing-tag"}, &meta_graph_def); EXPECT_FALSE(st.ok()); @@ -73,8 +78,7 @@ TEST_F(ReaderTest, NoTagMatch) { TEST_F(ReaderTest, NoTagMatchMultiple) { MetaGraphDef meta_graph_def; - const string export_dir = - io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataSharded); + const string export_dir = GetDataDependencyFilepath(TestDataSharded()); Status st = ReadMetaGraphDefFromSavedModel( export_dir, {kSavedModelTagServe, "missing-tag"}, &meta_graph_def); EXPECT_FALSE(st.ok()); @@ -87,8 +91,7 @@ TEST_F(ReaderTest, NoTagMatchMultiple) { TEST_F(ReaderTest, PbtxtFormat) { MetaGraphDef meta_graph_def; - const string export_dir = - io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataPbTxt); + const string export_dir = GetDataDependencyFilepath(TestDataPbTxt()); TF_ASSERT_OK(ReadMetaGraphDefFromSavedModel(export_dir, {kSavedModelTagServe}, &meta_graph_def)); CheckMetaGraphDef(meta_graph_def); @@ -97,8 +100,7 @@ TEST_F(ReaderTest, PbtxtFormat) { TEST_F(ReaderTest, InvalidExportPath) { MetaGraphDef meta_graph_def; - const string export_dir = - io::JoinPath(testing::TensorFlowSrcRoot(), "missing-path"); + const string export_dir = GetDataDependencyFilepath("missing-path"); Status st = ReadMetaGraphDefFromSavedModel(export_dir, {kSavedModelTagServe}, &meta_graph_def); EXPECT_FALSE(st.ok()); From 340b8e47745416936f43d9f13d6ea02753f61a68 Mon Sep 17 00:00:00 2001 From: Andrew Audibert Date: Thu, 20 Feb 2020 14:35:21 -0800 Subject: [PATCH 385/442] Fix error string. 
PiperOrigin-RevId: 296299506 Change-Id: I17388239c97ccb4ece0fe04c33e0a650f185670f --- tensorflow/python/data/ops/iterator_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py index d3fa08ffddf..668af74acf6 100644 --- a/tensorflow/python/data/ops/iterator_ops.py +++ b/tensorflow/python/data/ops/iterator_ops.py @@ -571,8 +571,8 @@ class OwnedIterator(trackable.Trackable, composite_tensor.CompositeTensor): `components` and `element_spec` is provided. """ - error_message = "Either `dataset` or both `components` and " - "`element_spec` need to be provided." + error_message = ("Either `dataset` or both `components` and " + "`element_spec` need to be provided.") self._device = context.context().device_name From 8001bbff8b2f80b1089bbb6cff384ca353443b60 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Thu, 20 Feb 2020 14:35:34 -0800 Subject: [PATCH 386/442] [TF:MLIR] Make Conv2D layout sensitive operation and update LayoutSensitiveInterface PiperOrigin-RevId: 296299559 Change-Id: I14810117004b724f0d054885a9d4fef45195128a --- .../mlir/tensorflow/ir/tf_generated_ops.td | 10 +- .../mlir/tensorflow/ir/tf_op_interfaces.td | 6 + .../compiler/mlir/tensorflow/ir/tf_ops.cc | 174 +++++++++++++----- ...layout_optimization_layout_assignment.mlir | 36 +++- .../transforms/layout_optimization.cc | 22 +-- 5 files changed, 175 insertions(+), 73 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 31e85ef247e..191e0afbdee 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -510,6 +510,7 @@ Broadcasting is supported, so `value` may have any number of dimensions. // TF_LayoutSensitiveInterface: SmallVector GetLayoutDependentArgs() { return {0}; } SmallVector GetLayoutDependentResults() { return {0}; } + LogicalResult UpdateDataFormat(StringRef data_format); }]; } @@ -980,7 +981,7 @@ tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j] let hasCanonicalizer = 1; } -def TF_Conv2DOp : TF_Op<"Conv2D", [NoSideEffect]> { +def TF_Conv2DOp : TF_Op<"Conv2D", [NoSideEffect, TF_LayoutSensitiveInterface]> { let summary = [{ Computes a 2-D convolution given 4-D `input` and `filter` tensors. }]; @@ -1030,6 +1031,13 @@ horizontal and vertices strides, `strides = [1, stride, stride, 1]`. let verifier = [{ return Verify(*this); }]; + + let extraClassDeclaration = [{ + // TF_LayoutSensitiveInterface: + SmallVector GetLayoutDependentArgs() { return {0}; } + SmallVector GetLayoutDependentResults() { return {0}; } + LogicalResult UpdateDataFormat(StringRef data_format); + }]; } def TF_Conv2DBackpropFilterOp : TF_Op<"Conv2DBackpropFilter", [NoSideEffect]> { diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td index 8700247af43..cc0819d71c9 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td @@ -50,6 +50,12 @@ def TF_LayoutSensitiveInterface : OpInterface<"LayoutSensitiveInterface"> { [{Returns indices of layout dependent results.}], "SmallVector", "GetLayoutDependentResults", (ins) >, + InterfaceMethod< + [{Updates operation attributes and operands to account for the updated + data format. 
If data format is not supported, must return failure.}], + "LogicalResult", "UpdateDataFormat", + (ins "StringRef":$data_format) + >, ]; let verify = [{ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index 57e16d91d69..e7c554d03a0 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -151,26 +151,6 @@ static bool AreCastCompatible(Type a, Type b) { b_kind == TensorFlowTypes::VARIANT; } -static bool AreCancellablePermutations(DenseIntElementsAttr perm0, - DenseIntElementsAttr perm1) { - if (perm0.getNumElements() == 0 || perm1.getNumElements() == 0) return false; - if (perm0.getNumElements() != perm1.getNumElements()) return false; - - SmallVector perm0_values; - for (auto value : perm0.getIntValues()) - perm0_values.push_back(value.getSExtValue()); - - SmallVector perm1_values; - for (auto value : perm1.getIntValues()) - perm1_values.push_back(value.getSExtValue()); - - for (int i = 0; i < perm0_values.size(); ++i) { - if (perm0_values[perm1_values[i]] != i) return false; - } - - return true; -} - static bool IsUnknownDimOrRank(int64_t dim_or_rank) { return dim_or_rank == -1; } @@ -312,6 +292,99 @@ static LogicalResult VerifyTypesCompatibility( return success(); } +//===----------------------------------------------------------------------===// +// TF op helper functions to work with layout transformation. +//===----------------------------------------------------------------------===// + +SmallVector GetDataFormatPermutation(StringRef from, StringRef to) { + if (from == "NHWC" && to == "NCHW") { + return {0, 3, 1, 2}; + } else if (from == "NCHW" && to == "NHWC") { + return {0, 1, 2, 3}; + } else { + return {}; + } +} + +// Shuffle elements in the `attr` according to the permutation. Optional +// `inner_size` allows to shuffle array attributes created from rank 2 tensors +// on outer dimension only. +ArrayAttr ShuffleArrayAttr(ArrayAttr attr, ArrayRef permutation, + int inner_size = 1) { + if (attr.size() == 0) return attr; + + assert(attr.size() % inner_size == 0); + assert(attr.size() / inner_size == permutation.size()); + + SmallVector values{attr.begin(), attr.end()}; + SmallVector shuffled(values.size()); + + for (size_t i = 0; i < permutation.size(); ++i) { + for (size_t j = 0; j < inner_size; ++j) { + shuffled[i * inner_size + j] = values[permutation[i] * inner_size + j]; + } + } + + return ArrayAttr::get(shuffled, attr.getContext()); +} + +// Shuffle ranked tensor dimensions according to the permutation. 
+Type ShuffleRankedTensorType(Type type, ArrayRef permutation) { + if (auto ranked_type = type.dyn_cast()) { + ArrayRef shape = ranked_type.getShape(); + assert(permutation.size() == shape.size()); + + SmallVector new_shape(permutation.size()); + for (size_t i = 0; i < permutation.size(); ++i) + new_shape[i] = shape[permutation[i]]; + + return RankedTensorType::get(new_shape, ranked_type.getElementType()); + } + + return type; +} + +static bool AreCancellablePermutations(DenseIntElementsAttr perm0, + DenseIntElementsAttr perm1) { + if (perm0.getNumElements() == 0 || perm1.getNumElements() == 0) return false; + if (perm0.getNumElements() != perm1.getNumElements()) return false; + + SmallVector perm0_values; + for (auto value : perm0.getIntValues()) + perm0_values.push_back(value.getSExtValue()); + + SmallVector perm1_values; + for (auto value : perm1.getIntValues()) + perm1_values.push_back(value.getSExtValue()); + + for (int i = 0; i < perm0_values.size(); ++i) { + if (perm0_values[perm1_values[i]] != i) return false; + } + + return true; +} + +// Default implementation of `LayoutSensitiveInterface::UpdateDataFormat` for +// layout sensitive operations that do not have any additional layout dependent +// attributes besides `data_format` string. +template +LogicalResult UpdateDataFormat(StringRef data_format, Op *op) { + auto perm = GetDataFormatPermutation(op->data_format(), data_format); + if (perm.empty()) return failure(); + + // Update data format attribute. + op->setAttr("data_format", StringAttr::get(data_format, op->getContext())); + + // Update types for all layout sensitive results. + auto layout_sensitive = cast(op->getOperation()); + for (unsigned idx : layout_sensitive.GetLayoutDependentResults()) { + OpResult result = op->getOperation()->getResult(idx); + result.setType(ShuffleRankedTensorType(result.getType(), perm)); + } + + return success(); +} + namespace { #include "tensorflow/compiler/mlir/tensorflow/transforms/generated_canonicalize.inc" } // namespace @@ -479,6 +552,10 @@ static LogicalResult Verify(BiasAddOp op) { return success(); } +LogicalResult BiasAddOp::UpdateDataFormat(StringRef data_format) { + return ::mlir::TF::UpdateDataFormat(data_format, this); +} + //===----------------------------------------------------------------------===// // BiasAddGradOp //===----------------------------------------------------------------------===// @@ -837,6 +914,21 @@ static LogicalResult Verify(OpT op) { return success(); } +LogicalResult Conv2DOp::UpdateDataFormat(StringRef data_format) { + auto perm = GetDataFormatPermutation(this->data_format(), data_format); + if (perm.empty()) return failure(); + + // Update data_format attribute and result types. + if (failed(::mlir::TF::UpdateDataFormat(data_format, this))) return failure(); + + // Update convolution attributes. + setAttr("dilations", ShuffleArrayAttr(dilations(), perm)); + setAttr("strides", ShuffleArrayAttr(strides(), perm)); + setAttr("explicit_paddings", ShuffleArrayAttr(explicit_paddings(), perm, 2)); + + return success(); +} + //===----------------------------------------------------------------------===// // Conv2dBackpropInputOp //===----------------------------------------------------------------------===// @@ -1358,53 +1450,33 @@ LogicalResult MaxPoolOp::FoldOperandsPermutation( ArrayRef permutation) { MLIRContext *context = getParentOfType().getContext(); + // Data format after folding permutation. + StringRef target_data_format; + // For now we only support folding of NCHW->NHWC and NHWC->NCHW permutations. 
if (data_format() == "NHWC") { static constexpr std::array kPerm = {0, 2, 3, 1}; // to NHWC if (permutation != ArrayRef(kPerm)) return failure(); - - setAttr("data_format", StringAttr::get("NCHW", context)); + target_data_format = "NCHW"; } else if (data_format() == "NCHW") { static constexpr std::array kPerm = {0, 3, 1, 2}; // to NCHW if (permutation != ArrayRef(kPerm)) return failure(); - - setAttr("data_format", StringAttr::get("NHWC", context)); + target_data_format = "NHWC"; } else { return failure(); } - auto shuffle_attr = [&](ArrayAttr attr) -> ArrayAttr { - SmallVector values{attr.begin(), attr.end()}; - SmallVector shuffled(values.size()); + auto perm = GetDataFormatPermutation(data_format(), target_data_format); + if (perm.empty()) return failure(); - for (size_t i = 0; i < permutation.size(); ++i) - shuffled[permutation[i]] = values[i]; - - return ArrayAttr::get(shuffled, context); - }; - - setAttr("strides", shuffle_attr(strides())); - setAttr("ksize", shuffle_attr(ksize())); - - auto shuffle_type = [&](Type type) -> Type { - if (auto ranked_type = type.dyn_cast()) { - ArrayRef shape = ranked_type.getShape(); - assert(permutation.size() == shape.size()); - - SmallVector new_shape(permutation.size()); - for (size_t i = 0; i < permutation.size(); ++i) - new_shape[permutation[i]] = shape[i]; - - return RankedTensorType::get(new_shape, ranked_type.getElementType()); - } - - return type; - }; + setAttr("data_format", StringAttr::get(target_data_format, context)); + setAttr("strides", ShuffleArrayAttr(strides(), perm)); + setAttr("ksize", ShuffleArrayAttr(ksize(), perm)); OpResult result = getOperation()->getResult(0); - result.setType(shuffle_type(result.getType())); + result.setType(ShuffleRankedTensorType(result.getType(), perm)); return success(); } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment.mlir index e8d667aea0f..983eabbbb02 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment.mlir @@ -38,4 +38,38 @@ func @transposeBiasWithUnknownShape(%arg0: tensor<1x4x4x8xf32>, %arg1: tensor<8x %0 = "tf.BiasAdd"(%arg0, %arg1) : (tensor<1x4x4x8xf32>, tensor<8xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> -} \ No newline at end of file +} + +// CHECK-LABEL: func @transposeConv2D +func @transposeConv2D(%input: tensor<1x32x32x3xf32>, %filter: tensor<1x1x3x8xf32>) -> tensor<1x32x32x8xf32> { + + // IMPORTANT: Tensor shapes do not match convolution parameters (stride, + // dilations, etc...). This test only verifies that changing convolution data + // layout will update all the attributes. 
+ + // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) + + // CHECK: %[[CONV2D:[0-9]*]] = "tf.Conv2D"(%[[ARG_TRANSPOSE]], %arg1) + // CHECK-SAME: data_format = "NCHW" + // CHECK-SAME: dilations = [1, 4, 2, 3] + // CHECK-SAME: explicit_paddings = [1, 2, 7, 8, 3, 4, 5, 6] + // CHECK-SAME: padding = "EXPLICIT" + // CHECK-SAME: strides = [5, 8, 6, 7] + // CHECK-SAME: (tensor<1x3x32x32xf32>, tensor<1x1x3x8xf32>) -> tensor<1x8x32x32xf32> + + // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} + // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[CONV2D]], %[[RES_PERM]]) + // CHECK: return %[[RES_TRANSPOSE]] + + %0 = "tf.Conv2D"(%input, %filter) + { + data_format = "NHWC", + dilations = [1, 2, 3, 4], + explicit_paddings = [1, 2, 3, 4, 5, 6, 7, 8], + padding = "EXPLICIT", + strides = [5, 6, 7, 8] + } : (tensor<1x32x32x3xf32>, tensor<1x1x3x8xf32>) -> tensor<1x32x32x8xf32> + + return %0 : tensor<1x32x32x8xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc b/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc index d642b093e6b..3fd410aa118 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc @@ -96,22 +96,6 @@ Permutation GetDataFormatPermutation(StringRef from_data_format, } } -Type PermuteRankedTensorType(Type type, Permutation permutation) { - if (auto ranked_type = type.dyn_cast()) { - ArrayRef shape = ranked_type.getShape(); - assert(permutation.size() == shape.size()); - - SmallVector new_shape(permutation.size()); - for (size_t i = 0; i < permutation.size(); ++i) { - new_shape[i] = shape[permutation[i]]; - } - - return RankedTensorType::get(new_shape, ranked_type.getElementType()); - } - - return type; -} - void LayoutAssignmentPass::runOnFunction() { FuncOp func = getFunction(); @@ -144,8 +128,8 @@ void LayoutAssignmentPass::runOnFunction() { }; // Change operation data format. - op->setAttr("data_format", - StringAttr::get(force_data_format_, op->getContext())); + if (failed(layout_sensitive_interface.UpdateDataFormat(force_data_format_))) + return; // Permute arguments into the target data format. builder.setInsertionPoint(op); @@ -162,8 +146,6 @@ void LayoutAssignmentPass::runOnFunction() { for (int64_t res : layout_sensitive_interface.GetLayoutDependentResults()) { OpResult result = op->getResult(res); - result.setType( - PermuteRankedTensorType(result.getType(), args_permutation)); auto transposed_res = builder.create(loc, result, res_perm); result.replaceAllUsesWith(transposed_res); From 4030aa1fe5bdd846301f379d1f1a0e58efbceae4 Mon Sep 17 00:00:00 2001 From: Advait Jain Date: Thu, 20 Feb 2020 14:39:51 -0800 Subject: [PATCH 387/442] Stub out TFLITE_ASSERT_FALSE for NDEBUG builds. Also, * add custom debug_log for xtensa-xpg that will be empty for NDEBUG builds. * Linux builds via the Makefile now do not have -DNDEBUG to be consistent with the bazel builds. 
PiperOrigin-RevId: 296300474 Change-Id: Ia1473e23bd8705f520beace7ee704479b0c52117 --- tensorflow/lite/kernels/op_macros.h | 6 +-- tensorflow/lite/micro/tools/make/Makefile | 4 +- .../tools/make/targets/bluepill_makefile.inc | 3 ++ tensorflow/lite/micro/xtensa-xpg/debug_log.cc | 45 +++++++++++++++++++ 4 files changed, 53 insertions(+), 5 deletions(-) create mode 100644 tensorflow/lite/micro/xtensa-xpg/debug_log.cc diff --git a/tensorflow/lite/kernels/op_macros.h b/tensorflow/lite/kernels/op_macros.h index 44208007b8a..33d033b10b6 100644 --- a/tensorflow/lite/kernels/op_macros.h +++ b/tensorflow/lite/kernels/op_macros.h @@ -31,7 +31,7 @@ inline void InfiniteLoop() { while (1) { } } -#define TFLITE_ASSERT_FALSE InfiniteLoop(); + #define TFLITE_ABORT InfiniteLoop(); #else // TF_LITE_MCU_DEBUG_LOG @@ -47,14 +47,14 @@ inline void InfiniteLoop() { #define TFLITE_ABORT abort() +#endif // TF_LITE_MCU_DEBUG_LOG + #ifdef NDEBUG #define TFLITE_ASSERT_FALSE (static_cast(0)) #else #define TFLITE_ASSERT_FALSE TFLITE_ABORT #endif -#endif // TF_LITE_MCU_DEBUG_LOG - #define TF_LITE_FATAL(msg) \ do { \ DEBUG_LOG(msg); \ diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile index 8ce1974c437..1dc45f88cb9 100644 --- a/tensorflow/lite/micro/tools/make/Makefile +++ b/tensorflow/lite/micro/tools/make/Makefile @@ -68,10 +68,10 @@ MICROLITE_LIBS := -lm # There are no rules for compiling objects for the host system (since we don't # generate things like the protobuf compiler that require that), so all of # these settings are for the target compiler. -CXXFLAGS := -O3 -DNDEBUG +CXXFLAGS := -O3 CXXFLAGS += -std=c++11 -g -DTF_LITE_STATIC_MEMORY CXXFLAGS += -fno-rtti -CCFLAGS := -DNDEBUG -g -DTF_LITE_STATIC_MEMORY +CCFLAGS := -g -DTF_LITE_STATIC_MEMORY LDOPTS := -L/usr/local/lib ARFLAGS := -r TARGET_TOOLCHAIN_PREFIX := diff --git a/tensorflow/lite/micro/tools/make/targets/bluepill_makefile.inc b/tensorflow/lite/micro/tools/make/targets/bluepill_makefile.inc index 65155dfedb8..878067cf083 100644 --- a/tensorflow/lite/micro/tools/make/targets/bluepill_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/bluepill_makefile.inc @@ -8,10 +8,13 @@ ifeq ($(TARGET), bluepill) $(eval $(call add_third_party_download,$(CMSIS_URL),$(CMSIS_MD5),cmsis,)) $(eval $(call add_third_party_download,$(STM32_BARE_LIB_URL),$(STM32_BARE_LIB_MD5),stm32_bare_lib,)) + # TODO(b/149943573): It may be worthwhile to remove -DNDEBUG if we can get the + # bluepill target to compile without it. PLATFORM_FLAGS = \ -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \ -DTF_LITE_STATIC_MEMORY \ -DTF_LITE_MCU_DEBUG_LOG \ + -DNDEBUG \ -fno-rtti \ -fmessage-length=0 \ -fno-exceptions \ diff --git a/tensorflow/lite/micro/xtensa-xpg/debug_log.cc b/tensorflow/lite/micro/xtensa-xpg/debug_log.cc new file mode 100644 index 00000000000..a95a084978b --- /dev/null +++ b/tensorflow/lite/micro/xtensa-xpg/debug_log.cc @@ -0,0 +1,45 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// Reference implementation of the DebugLog() function that's required for a +// platform to support the TensorFlow Lite for Microcontrollers library. This is +// the only function that's absolutely required to be available on a target +// device, since it's used for communicating test results back to the host so +// that we can verify the implementation is working correctly. +// It's designed to be as easy as possible to supply an implementation though. +// On platforms that have a POSIX stack or C library, it can be written as a +// single call to `fprintf(stderr, "%s", s)` to output a string to the error +// stream of the console, but if there's no OS or C library available, there's +// almost always an equivalent way to write out a string to some serial +// interface that can be used instead. For example on Arm M-series MCUs, calling +// the `bkpt #0xAB` assembler instruction will output the string in r1 to +// whatever debug serial connection is available. If you're running mbed, you +// can do the same by creating `Serial pc(USBTX, USBRX)` and then calling +// `pc.printf("%s", s)`. +// To add an equivalent function for your own platform, create your own +// implementation file, and place it in a subfolder with named after the OS +// you're targeting. For example, see the Cortex M bare metal version in +// tensorflow/lite/micro/bluepill/debug_log.cc or the mbed one on +// tensorflow/lite/micro/mbed/debug_log.cc. + +#include "tensorflow/lite/micro/debug_log.h" + +#include + +extern "C" void DebugLog(const char* s) { +#ifndef NDEBUG + fprintf(stderr, "%s", s); +#endif +} From 3ba8bd697faf4b831f78c3fa547d7956f1b1a0aa Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Thu, 20 Feb 2020 14:50:37 -0800 Subject: [PATCH 388/442] Fix the cache key problem when compute_output_shape(). This is a very tricky one wrt the id() of int in python. Under the hood, id returns memory address for the int, and python has a cache location for the ints, which result into different ints get same hash value. Changed to use tuples of shape itself as the dict key, since the tuple itself is immutable and hashable. Same tuple value will return the same hash value. Also remove the generic utils for that where network.py is only usage for that function. Fix #32029 PiperOrigin-RevId: 296302946 Change-Id: I865c9380a06ed6ee80fea7f942c21c4d102473c2 --- tensorflow/python/keras/engine/network.py | 6 ++++-- tensorflow/python/keras/engine/network_test.py | 9 +++++++++ tensorflow/python/keras/utils/generic_utils.py | 6 ------ 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py index 166553a324b..79f15d9f3ae 100644 --- a/tensorflow/python/keras/engine/network.py +++ b/tensorflow/python/keras/engine/network.py @@ -720,7 +720,9 @@ class Network(base_layer.Layer): ': model has ' + str(len(self._input_layers)) + ' tensor inputs.') - cache_key = generic_utils.object_list_uid(input_shape) + # Use the tuple of TensorShape as the cache key, since tuple is hashable + # and can be used as hash key. + cache_key = tuple(tf_utils.convert_shapes(input_shape, to_tuples=True)) if cache_key in self._output_shape_cache: # Cache hit. Return shapes as TensorShapes. 
return self._output_shape_cache[cache_key] @@ -905,7 +907,7 @@ class Network(base_layer.Layer): if output_shapes is not None: input_shapes = [x.shape for x in inputs] - cache_key = generic_utils.object_list_uid(input_shapes) + cache_key = tuple(tf_utils.convert_shapes(input_shapes, to_tuples=True)) self._output_shape_cache[cache_key] = nest.pack_sequence_as( self._nested_outputs, output_shapes) diff --git a/tensorflow/python/keras/engine/network_test.py b/tensorflow/python/keras/engine/network_test.py index b3e19f2a6ea..17f08889936 100644 --- a/tensorflow/python/keras/engine/network_test.py +++ b/tensorflow/python/keras/engine/network_test.py @@ -1869,6 +1869,15 @@ class CacheCorrectnessTest(keras_parameterized.TestCase): self.assertEqual(network.dynamic, False) self.assertEqual(network.stateful, False) + def test_compute_output_shape_cache(self): + # See https://github.com/tensorflow/tensorflow/issues/32029. + x = input_layer_lib.Input(shape=(None, 32)) + dense = keras.layers.Dense(2) + y = dense(x) + network = network_lib.Network(x, y, name='dense_network') + + for i in range(999, 1024): + self.assertEqual(network.compute_output_shape((1, i, 32)), (1, i, 2)) if __name__ == '__main__': diff --git a/tensorflow/python/keras/utils/generic_utils.py b/tensorflow/python/keras/utils/generic_utils.py index edbfed6d776..9ee644bf8cd 100644 --- a/tensorflow/python/keras/utils/generic_utils.py +++ b/tensorflow/python/keras/utils/generic_utils.py @@ -756,12 +756,6 @@ def to_list(x): return [x] -def object_list_uid(object_list): - """Creates a single string from object ids.""" - object_list = nest.flatten(object_list) - return ', '.join(str(abs(id(x))) for x in object_list) - - def to_snake_case(name): intermediate = re.sub('(.)([A-Z][a-z0-9]+)', r'\1_\2', name) insecure = re.sub('([a-z])([A-Z])', r'\1_\2', intermediate).lower() From f120f7d514d50428bc34b4435ea8253f5cece990 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 20 Feb 2020 14:59:42 -0800 Subject: [PATCH 389/442] Suppress 'conversion to a dense matrix' warning from LinearOperatorFullMatrix.solve(). The current warning is inappropriate: since a LinearOperatorFullMatrix is inherently dense, no efficiency is lost when we treat it as dense. PiperOrigin-RevId: 296305093 Change-Id: Id3b7e2a00f05d1e516374c4241cd84529844a056 --- tensorflow/python/ops/linalg/linear_operator.py | 16 ++++++++++------ .../ops/linalg/linear_operator_full_matrix.py | 3 +++ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/ops/linalg/linear_operator.py b/tensorflow/python/ops/linalg/linear_operator.py index 194889c1ad5..4a181d72f2a 100644 --- a/tensorflow/python/ops/linalg/linear_operator.py +++ b/tensorflow/python/ops/linalg/linear_operator.py @@ -751,14 +751,11 @@ class LinearOperator(module.Module): with self._name_scope(name): return self._log_abs_determinant() - def _solve(self, rhs, adjoint=False, adjoint_arg=False): - """Default implementation of _solve.""" - if self.is_square is False: + def _dense_solve(self, rhs, adjoint=False, adjoint_arg=False): + """Solve by conversion to a dense matrix.""" + if self.is_square is False: # pylint: disable=g-bool-id-comparison raise NotImplementedError( "Solve is not yet implemented for non-square operators.") - logging.warn( - "Using (possibly slow) default implementation of solve." 
- " Requires conversion to a dense matrix and O(N^3) operations.") rhs = linalg.adjoint(rhs) if adjoint_arg else rhs if self._can_use_cholesky(): return linalg_ops.cholesky_solve( @@ -766,6 +763,13 @@ class LinearOperator(module.Module): return linear_operator_util.matrix_solve_with_broadcast( self.to_dense(), rhs, adjoint=adjoint) + def _solve(self, rhs, adjoint=False, adjoint_arg=False): + """Default implementation of _solve.""" + logging.warn( + "Using (possibly slow) default implementation of solve." + " Requires conversion to a dense matrix and O(N^3) operations.") + return self._dense_solve(rhs, adjoint=adjoint, adjoint_arg=adjoint_arg) + def solve(self, rhs, adjoint=False, adjoint_arg=False, name="solve"): """Solve (exact or approx) `R` (batch) systems of equations: `A X = rhs`. diff --git a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py index 8fe68919250..8d92d1accaa 100644 --- a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py +++ b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py @@ -183,5 +183,8 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator): return math_ops.matmul( self._matrix, x, adjoint_a=adjoint, adjoint_b=adjoint_arg) + def _solve(self, rhs, adjoint=False, adjoint_arg=False): + return self._dense_solve(rhs, adjoint=adjoint, adjoint_arg=adjoint_arg) + def _to_dense(self): return self._matrix From bdee60d828b3c02a7d371ec1ac3d12a616c51dfd Mon Sep 17 00:00:00 2001 From: Paul Donnelly Date: Thu, 20 Feb 2020 15:05:38 -0800 Subject: [PATCH 390/442] Stop the gradient for QuantizeAndDequantizeV2 when the input is out of range. PiperOrigin-RevId: 296306551 Change-Id: Idcc1153ed7bfcac6cd9b6533800bf753d6ec166e --- tensorflow/cc/gradients/array_grad.cc | 29 +++-- .../api_def_QuantizeAndDequantizeV2Grad.pbtxt | 8 ++ .../api_def_QuantizeAndDequantizeV2Grad.pbtxt | 3 + .../api_def_QuantizeAndDequantizeV2Grad.pbtxt | 4 + .../kernels/quantize_and_dequantize_op.cc | 116 ++++++++++++++++++ .../core/kernels/quantize_and_dequantize_op.h | 71 +++++++++++ .../quantize_and_dequantize_op_gpu.cu.cc | 40 ++++++ .../quantize_and_dequantize_op_test.cc | 48 ++++++++ tensorflow/core/ops/array_ops.cc | 32 +++++ .../eager/pywrap_gradient_exclusions.cc | 5 +- tensorflow/python/ops/array_grad.py | 5 - tensorflow/python/ops/array_ops.py | 17 +++ .../api/golden/v1/tensorflow.raw_ops.pbtxt | 4 + .../api/golden/v2/tensorflow.raw_ops.pbtxt | 4 + 14 files changed, 369 insertions(+), 17 deletions(-) create mode 100644 tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2Grad.pbtxt create mode 100644 tensorflow/core/api_def/java_api/api_def_QuantizeAndDequantizeV2Grad.pbtxt create mode 100644 tensorflow/core/api_def/python_api/api_def_QuantizeAndDequantizeV2Grad.pbtxt diff --git a/tensorflow/cc/gradients/array_grad.cc b/tensorflow/cc/gradients/array_grad.cc index e9173227aad..3c0813bfe23 100644 --- a/tensorflow/cc/gradients/array_grad.cc +++ b/tensorflow/cc/gradients/array_grad.cc @@ -15,13 +15,12 @@ limitations under the License. 
#include +#include "tensorflow/cc/framework/grad_op_registry.h" +#include "tensorflow/cc/framework/gradients.h" #include "tensorflow/cc/ops/array_ops_internal.h" #include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/cc/framework/grad_op_registry.h" -#include "tensorflow/cc/framework/gradients.h" - namespace tensorflow { namespace ops { namespace { @@ -90,15 +89,25 @@ Status QuantizeAndDequantizeGrad(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("QuantizeAndDequantize", QuantizeAndDequantizeGrad); -Status QuantizeAndDequantizeV2Grad(const Scope& scope, const Operation& op, - const std::vector& grad_inputs, - std::vector* grad_outputs) { - grad_outputs->push_back(Identity(scope, grad_inputs[0])); - grad_outputs->push_back(NoGradient()); - grad_outputs->push_back(NoGradient()); +Status QuantizeAndDequantizeV2GradHelper(const Scope& scope, + const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + Input input = Shape(scope, op.input(0)); + Input input_min = op.input(1); + Input input_max = op.input(2); + int64 axis; + TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "axis", &axis)); + auto qdq_v2_grad = QuantizeAndDequantizeV2Grad( + scope, grad_inputs[0], input, input_min, input_max, + QuantizeAndDequantizeV2Grad::Axis(axis)); + grad_outputs->push_back(qdq_v2_grad.input_backprop); + grad_outputs->push_back(qdq_v2_grad.input_min_backprop); + grad_outputs->push_back(qdq_v2_grad.input_max_backprop); return scope.status(); } -REGISTER_GRADIENT_OP("QuantizeAndDequantizeV2", QuantizeAndDequantizeV2Grad); +REGISTER_GRADIENT_OP("QuantizeAndDequantizeV2", + QuantizeAndDequantizeV2GradHelper); Status QuantizeAndDequantizeV3Grad(const Scope& scope, const Operation& op, const std::vector& grad_inputs, diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2Grad.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2Grad.pbtxt new file mode 100644 index 00000000000..6a7a2f38897 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2Grad.pbtxt @@ -0,0 +1,8 @@ +op { + graph_op_name: "QuantizeAndDequantizeV2Grad" + summary: "Returns the gradient of `QuantizeAndDequantizeV2`." + description: <

+        // Syntactically similar to LLVM:
+        func @testFunction(%arg0: i32) {
+          %x = call @thingToCall(%arg0) : (i32) -> i32
+          br ^bb1
+        ^bb1:
+          %y = addi %x, %x : i32
+          return %y : i32
+        }
+        
+ + - classname: devsite-landing-row-cards + items: + - heading: "Multi-Level Intermediate Representation for Compiler Infrastructure" + youtube_id: qzljG6DKgic + buttons: + - label: Watch the video + path: https://www.youtube.com/watch?v=qzljG6DKgic + - heading: "A new intermediate representation and compiler framework" + image_path: /resources/images/tf-logo-card-16x9.png + path: https://blog.tensorflow.org/2019/04/mlir-new-intermediate-representation.html + buttons: + - label: Read on TensorFlow blog + path: https://blog.tensorflow.org/2019/04/mlir-new-intermediate-representation.html + - heading: MLIR on GitHub + image_path: /resources/images/github-card-16x9.png + path: https://github.com/llvm/llvm-project/tree/master/mlir + buttons: + - label: View on GitHub + path: https://github.com/llvm/llvm-project/tree/master/mlir + - heading: TensorFlow MLIR on GitHub + image_path: /resources/images/github-card-16x9.png + path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/mlir + buttons: + - label: View on GitHub + path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/mlir diff --git a/tensorflow/compiler/mlir/g3doc/dialects.md b/tensorflow/compiler/mlir/g3doc/dialects.md new file mode 100644 index 00000000000..fa6c4605b27 --- /dev/null +++ b/tensorflow/compiler/mlir/g3doc/dialects.md @@ -0,0 +1,37 @@ +# MLIR dialects + +## Overview + + +To separate different hardware and software targets, MLIR has “dialects”, +including: + +* TensorFlow IR, which represents all things possible in TensorFlow graphs. +* XLA HLO IR, which is designed to take advantage of XLA’s compilation + abilities (with output to, among other things, TPUs). +* An experimental affine dialect, which focuses on + [polyhedral representations](https://en.wikipedia.org/wiki/Polytope_model) + and optimizations. +* LLVM IR, which has a 1:1 mapping between it and LLVM’s own representation, + allowing MLIR to emit GPU and CPU code through LLVM. +* TensorFlow Lite, which will translate to running code on mobile platforms. + +Each dialect consists of a set of defined operations which have invariants +placed on them, like: “This is a binary operator, and the inputs and outputs +have the same types.” + +## Adding to MLIR + +MLIR has no fixed/built-in list of globally known operations (no “intrinsics”). +Dialects can define entirely custom types, which is how MLIR can model things +like the LLVM IR type system (which has first class aggregates), domain +abstractions important for ML-optimized accelerators like quantized types, and +even the Swift or Clang type systems (which are built around Swift/Clang +declaration nodes) in the future. + +If you want to connect a new low-level compiler, you would create a new dialect +and the lowerings between the TensorFlow Graph dialect and your dialect. +This smooths the path for hardware and compiler makers. You can even target +dialects at different levels in the same model; the higher-level optimizers +will respect the unfamiliar parts of the IR and wait for a lower level to handle +it. 
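As a rough illustration of that workflow, the sketch below shows the same computation written first with an op from the TensorFlow dialect and then with an op from an invented `mydialect`; the `mydialect.matmul` name, the tensor shapes, and the lowering itself are hypothetical and exist only to show how a custom backend dialect could slot in.

```mlir
// Hypothetical sketch: "mydialect" and its op are invented for illustration
// and are not a real MLIR or TensorFlow dialect.

// Before lowering: an op from the TensorFlow dialect, with its own
// invariants and tensor types.
func @tf_level(%arg0: tensor<8x16xf32>, %arg1: tensor<16x4xf32>) -> tensor<8x4xf32> {
  %0 = "tf.MatMul"(%arg0, %arg1) {transpose_a = false, transpose_b = false}
       : (tensor<8x16xf32>, tensor<16x4xf32>) -> tensor<8x4xf32>
  return %0 : tensor<8x4xf32>
}

// After a custom lowering pass: the same computation expressed with an op
// from the new dialect that a particular backend knows how to compile.
func @backend_level(%arg0: tensor<8x16xf32>, %arg1: tensor<16x4xf32>) -> tensor<8x4xf32> {
  %0 = "mydialect.matmul"(%arg0, %arg1)
       : (tensor<8x16xf32>, tensor<16x4xf32>) -> tensor<8x4xf32>
  return %0 : tensor<8x4xf32>
}
```

Passes that know nothing about `mydialect` simply leave its operations alone, which is what lets optimizers working at different levels coexist in the same module.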
diff --git a/tensorflow/compiler/mlir/g3doc/images/mlir-infra.svg b/tensorflow/compiler/mlir/g3doc/images/mlir-infra.svg new file mode 100644 index 00000000000..aec0986ba02 --- /dev/null +++ b/tensorflow/compiler/mlir/g3doc/images/mlir-infra.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/tensorflow/compiler/mlir/g3doc/overview.md b/tensorflow/compiler/mlir/g3doc/overview.md new file mode 100644 index 00000000000..4cf99ba3800 --- /dev/null +++ b/tensorflow/compiler/mlir/g3doc/overview.md @@ -0,0 +1,36 @@ +# MLIR + +## Overview + +MLIR, or Multi-Level Intermediate Representation, is a representation format +and library of compiler utilities that sits between the model representation +and low-level compilers/executors that generate hardware-specific code. + +MLIR is, at its heart, a flexible infrastructure for modern optimizing +compilers. This means it consists of a specification for intermediate +representations (IR) and a code toolkit to perform transformations on that +representation. (In compiler parlance, as you move from higher-level +representations to lower-level representations, these transformations can be +called “lowerings”) + +MLIR is highly influenced by [LLVM](https://llvm.org/) and unabashedly reuses +many great ideas from it. It has a flexible type system, and allows +representing, analyzing and transforming graphs combining multiple levels of +abstraction in the same compilation unit. These abstractions include TensorFlow +operations, nested polyhedral loop regions, and even LLVM instructions and fixed +hardware operations and types. + +We expect MLIR to be of interest to many groups, including: + +* Compiler researchers and implementers looking to optimize performance and + memory consumption of machine learning models +* Hardware makers looking for a way to connect their hardware to TensorFlow, + such as TPUs, portable neural hardware in phones, and other custom ASICs +* People writing language bindings that want to take advantage of optimizing + compilers and hardware acceleration. + +The TensorFlow ecosystem contains a number of compilers and optimizers that +operate at multiple levels of the software and hardware stack. We expect the +gradual adoption of MLIR to simplify every aspect of this stack. + +MLIR overview diagram From c9de7258f7557edcfc16cd3cd160284bf70ecdb0 Mon Sep 17 00:00:00 2001 From: Jonathan Hseu Date: Thu, 20 Feb 2020 17:44:04 -0800 Subject: [PATCH 417/442] Set shard count in the input test to prevent timeouts PiperOrigin-RevId: 296336666 Change-Id: I04bc3e046f18cd0a72c891d65be34298bf16c202 --- tensorflow/python/distribute/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 461365b4b45..e27289e6bfa 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -946,6 +946,7 @@ distribute_py_test( name = "custom_training_loop_input_test", srcs = ["custom_training_loop_input_test.py"], main = "custom_training_loop_input_test.py", + shard_count = 5, tags = [ "multi_and_single_gpu", ], From d871085b8683c9739359b0814615f94e4486794d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 20 Feb 2020 17:59:20 -0800 Subject: [PATCH 418/442] Update ops-related pbtxt files. 
PiperOrigin-RevId: 296339143 Change-Id: Ie9f9b914c9f3b660eafc9fad080d7935ee0466b1 --- .../QuantizeAndDequantizeV2Grad.pbtxt | 50 +++++++++++++++++++ tensorflow/core/ops/ops.pbtxt | 50 +++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 tensorflow/core/ops/compat/ops_history_v1/QuantizeAndDequantizeV2Grad.pbtxt diff --git a/tensorflow/core/ops/compat/ops_history_v1/QuantizeAndDequantizeV2Grad.pbtxt b/tensorflow/core/ops/compat/ops_history_v1/QuantizeAndDequantizeV2Grad.pbtxt new file mode 100644 index 00000000000..c1355f10390 --- /dev/null +++ b/tensorflow/core/ops/compat/ops_history_v1/QuantizeAndDequantizeV2Grad.pbtxt @@ -0,0 +1,50 @@ +op { + name: "QuantizeAndDequantizeV2Grad" + input_arg { + name: "gradients" + type_attr: "T" + } + input_arg { + name: "input" + type_attr: "T" + } + input_arg { + name: "input_min" + type_attr: "T" + } + input_arg { + name: "input_max" + type_attr: "T" + } + output_arg { + name: "input_backprop" + type_attr: "T" + } + output_arg { + name: "input_min_backprop" + type_attr: "T" + } + output_arg { + name: "input_max_backprop" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + } + } + } + attr { + name: "axis" + type: "int" + default_value { + i: -1 + } + } +} diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index 526a1bfb46c..781fa72743c 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -28378,6 +28378,56 @@ op { } } } +op { + name: "QuantizeAndDequantizeV2Grad" + input_arg { + name: "gradients" + type_attr: "T" + } + input_arg { + name: "input" + type_attr: "T" + } + input_arg { + name: "input_min" + type_attr: "T" + } + input_arg { + name: "input_max" + type_attr: "T" + } + output_arg { + name: "input_backprop" + type_attr: "T" + } + output_arg { + name: "input_min_backprop" + type_attr: "T" + } + output_arg { + name: "input_max_backprop" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + } + } + } + attr { + name: "axis" + type: "int" + default_value { + i: -1 + } + } +} op { name: "QuantizeAndDequantizeV3" input_arg { From 06db91b9dd68b086d7175734e7369992f894d493 Mon Sep 17 00:00:00 2001 From: Yu-Cheng Ling Date: Thu, 20 Feb 2020 18:00:10 -0800 Subject: [PATCH 419/442] Enable TFLite experimental new converter by default. PiperOrigin-RevId: 296339261 Change-Id: Ibae109e8ebc3dae196e144b6ec7a740d4b7c82fd --- tensorflow/lite/python/lite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index 3965a4ac275..6aee3bc0d75 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -78,7 +78,7 @@ from tensorflow.python.util.tf_export import tf_export as _tf_export # The default value of `experimental_new_converter`. -_USE_EXPERIMENTAL_NEW_CONVERTER = False +_USE_EXPERIMENTAL_NEW_CONVERTER = True @_tf_export("lite.Optimize") From 835ac7291dd62277e27d1a66e241608b98790bb3 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Thu, 20 Feb 2020 18:01:00 -0800 Subject: [PATCH 420/442] Internal change PiperOrigin-RevId: 296339357 Change-Id: Ife4d6cc532586e15b94c049786977c4a7acf597d --- tensorflow/python/keras/engine/network.py | 87 +++++-------------- .../python/keras/engine/network_test.py | 38 -------- tensorflow/python/keras/saving/save.py | 22 +++-- 3 files changed, 32 insertions(+), 115 deletions(-) diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py index 98abbed80a6..79f15d9f3ae 100644 --- a/tensorflow/python/keras/engine/network.py +++ b/tensorflow/python/keras/engine/network.py @@ -1063,7 +1063,28 @@ class Network(base_layer.Layer): ValueError: For invalid/unknown format arguments. """ self._assert_weights_created() - save_format = validate_save_format(filepath, save_format) + filepath_is_h5 = _is_hdf5_filepath(filepath) + if save_format is None: + if filepath_is_h5: + save_format = 'h5' + else: + save_format = 'tf' + else: + user_format = save_format.lower().strip() + if user_format in ('tensorflow', 'tf'): + save_format = 'tf' + elif user_format in ('hdf5', 'h5', 'keras'): + save_format = 'h5' + else: + raise ValueError( + 'Unknown format "%s". Was expecting one of {"tf", "h5"}.' % ( + save_format,)) + if save_format == 'tf' and filepath_is_h5: + raise ValueError( + ('save_weights got save_format="tf"/"tensorflow", but the ' + 'filepath ("%s") looks like an HDF5 file. Omit the ".h5"/".keras" ' + 'when saving in TensorFlow format.') + % filepath) if save_format == 'h5' and h5py is None: raise ImportError( @@ -2086,67 +2107,3 @@ def get_network_config(network, serialize_layer_fn=None): model_outputs = tf_utils.convert_inner_node_data(model_outputs) config['output_layers'] = model_outputs return config - - -def validate_save_format(filepath, save_format, default='tf'): - """Validates `save_format` argument passed to methods used for saving. - - Returns either 'tf' or 'h5', indicating whether to save the model - to Tensorflow SavedModel or HDF5. Output will default to 'tf' in TF2.X and - 'h5' in TF1.X. - - Defaults to 'h5' if `filepath` is a path to a hdf5 file (having suffix '.h5' - or '.hdf5' or '.keras') or is an h5py.File object. - - Args: - filepath: Value of the `filepath` argument passed to the method. - Can be: - String - h5py.File object - save_format: String, value of the 'save_format' argument as passed. - default: Default format if save_format isn't specified and the filepath - doesn't indicate that the format is 'h5'. - - Returns: - save_format: String, 'h5' or 'tf'. The processed - value of the `save_format` argument. - - Raises: - ValueError: If - - `filepath` is not a String or an h5py.File object. - - `save_format` is not valid. Valid values are "tensorflow", "tf" for - saving in SavedModel format, and "hdf5", "keras" or "h5" for saving in - h5 format. - - `save_format` is "tf" but `filepath` is a path to a h5 file. - - `save_format` is "tf" but `filepath` is an h5py.File object. - """ - if not isinstance(filepath, (str, h5py.File)): - raise ValueError( - 'Expected `filepath` to be a String or h5py.File object. 
Got ' - 'unsupported value %s of type %s' % (filepath, type(filepath))) - - filepath_is_h5py_file = h5py is not None and isinstance(filepath, h5py.File) - filepath_is_h5 = isinstance(filepath, str) and _is_hdf5_filepath(filepath) - if save_format is None: - if filepath_is_h5 or filepath_is_h5py_file: - save_format = 'h5' - else: - save_format = default - else: - user_format = save_format.lower().strip() - if user_format in ('tensorflow', 'tf'): - save_format = 'tf' - elif user_format in ('hdf5', 'h5', 'keras'): - save_format = 'h5' - else: - raise ValueError( - 'Unknown format "%s". Was expecting one of {"tf", "h5"}.' % - (save_format)) - if save_format == 'tf' and filepath_is_h5: - raise ValueError( - ('Got save_format="tf"/"tensorflow", but the filepath ("%s") looks ' - 'like an HDF5 file. Omit the ".h5"/".keras" when saving in ' - 'TensorFlow format.') % filepath) - if save_format == 'tf' and filepath_is_h5py_file: - raise ValueError( - 'Got save_format="tf"/"tensorflow", but the given `filepath`' - 'is an h5py.File object.') - return save_format diff --git a/tensorflow/python/keras/engine/network_test.py b/tensorflow/python/keras/engine/network_test.py index 493f6f02867..17f08889936 100644 --- a/tensorflow/python/keras/engine/network_test.py +++ b/tensorflow/python/keras/engine/network_test.py @@ -1880,43 +1880,5 @@ class CacheCorrectnessTest(keras_parameterized.TestCase): self.assertEqual(network.compute_output_shape((1, i, 32)), (1, i, 2)) -class SaveFormatValidationTest(keras_parameterized.TestCase): - - def test_save_format_validation(self): - filepath = 'file/path' - h5_filepath = 'h5_filepath.h5' - h5_filepath_2 = 'h5_filepath.hdf5' - h5_filepath_3 = 'h5_filepath.keras' - - self.assertEqual( - network_lib.validate_save_format(filepath, None, 'h5'), 'h5') - self.assertEqual( - network_lib.validate_save_format(filepath, None, 'tf'), 'tf') - - self.assertEqual(network_lib.validate_save_format(filepath, 'h5'), 'h5') - self.assertEqual(network_lib.validate_save_format(h5_filepath, None), 'h5') - self.assertEqual( - network_lib.validate_save_format(h5_filepath_2, None), 'h5') - self.assertEqual( - network_lib.validate_save_format(h5_filepath_3, None), 'h5') - self.assertEqual( - network_lib.validate_save_format(h5_filepath, 'hdf5'), 'h5') - self.assertEqual( - network_lib.validate_save_format(h5_filepath, 'keras'), 'h5') - - self.assertEqual(network_lib.validate_save_format(filepath, 'tf'), 'tf') - self.assertEqual( - network_lib.validate_save_format(filepath, 'tensorflow'), 'tf') - - with self.assertRaises(ValueError): - network_lib.validate_save_format(42, 'h5') - - with self.assertRaises(ValueError): - network_lib.validate_save_format(filepath, 'unknown_format') - - with self.assertRaises(ValueError): - network_lib.validate_save_format(h5_filepath, 'tf') - - if __name__ == '__main__': test.main() diff --git a/tensorflow/python/keras/saving/save.py b/tensorflow/python/keras/saving/save.py index d678e14b0c4..7344e6f9f59 100644 --- a/tensorflow/python/keras/saving/save.py +++ b/tensorflow/python/keras/saving/save.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import os import sys import six @@ -28,15 +29,8 @@ from tensorflow.python.keras.saving.saved_model import load as saved_model_load from tensorflow.python.keras.saving.saved_model import save as saved_model_save from tensorflow.python.keras.utils import generic_utils from tensorflow.python.saved_model import loader_impl -from 
tensorflow.python.util.lazy_loader import LazyLoader from tensorflow.python.util.tf_export import keras_export -# pylint: disable=g-inconsistent-quotes -network = LazyLoader( - "network", globals(), - "tensorflow.python.keras.engine.network") -# pylint: enable=g-inconsistent-quotes - # pylint: disable=g-import-not-at-top if sys.version_info >= (3, 4): import pathlib @@ -46,6 +40,9 @@ except ImportError: h5py = None # pylint: enable=g-import-not-at-top +_HDF5_EXTENSIONS = ['.h5', '.hdf5', '.keras'] + + # TODO(kathywu): Remove this when Keras SavedModel is not experimental. _KERAS_SAVED_MODEL_STILL_EXPERIMENTAL = True @@ -115,14 +112,15 @@ def save_model(model, """ from tensorflow.python.keras.engine import sequential # pylint: disable=g-import-not-at-top + default_format = 'tf' if tf2.enabled() else 'h5' + save_format = save_format or default_format + if sys.version_info >= (3, 4) and isinstance(filepath, pathlib.Path): filepath = str(filepath) - default_format = 'tf' if tf2.enabled() else 'h5' - save_format = network.validate_save_format(filepath, save_format, - default_format) - - if save_format == 'h5': + if (save_format == 'h5' or + (h5py is not None and isinstance(filepath, h5py.File)) or + os.path.splitext(filepath)[1] in _HDF5_EXTENSIONS): # TODO(b/130258301): add utility method for detecting model type. if (not model._is_graph_network and # pylint:disable=protected-access not isinstance(model, sequential.Sequential)): From 0fd32f328d413673326eeed4d64469a3c21d8769 Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Thu, 20 Feb 2020 18:13:49 -0800 Subject: [PATCH 421/442] properly initialize shape_tensor in Hexagon delegate reshape PiperOrigin-RevId: 296341583 Change-Id: If3b36d6941d85494ea73fa8ff4207152d6cf48d5 --- .../experimental/delegates/hexagon/builders/reshape_builder.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/reshape_builder.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/reshape_builder.cc index eb755729267..7a69d56b349 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/reshape_builder.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/reshape_builder.cc @@ -58,7 +58,7 @@ TfLiteStatus ReshapeOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, AddInput(graph_builder_->GetHexagonTensorId(inputs->data[0])); // Output shape. 
- TfLiteTensor* shape_tensor; + TfLiteTensor* shape_tensor = nullptr; bool output_shape_is_dynamic = false; if (inputs->size == 2) { shape_tensor = &context->tensors[inputs->data[1]]; From fce5012148678b35aa431d128db6987742d97cb5 Mon Sep 17 00:00:00 2001 From: Hyeonjong Ryu Date: Thu, 20 Feb 2020 18:22:40 -0800 Subject: [PATCH 422/442] String input support on TFLite Tile op PiperOrigin-RevId: 296342758 Change-Id: I4498b56b6da7074f8747fab893009b0d1d0d3cc9 --- tensorflow/compiler/mlir/lite/ir/tfl_ops.td | 4 +- tensorflow/lite/kernels/register.cc | 4 +- tensorflow/lite/kernels/tile.cc | 70 +++++++++++++++++++ tensorflow/lite/kernels/tile_test.cc | 48 +++++++++++++ tensorflow/lite/testing/op_tests/tile.py | 2 +- tensorflow/lite/toco/tflite/op_version.cc | 1 + .../lite/tools/versioning/op_version.cc | 6 ++ .../lite/tools/versioning/op_version_test.cc | 13 ++++ 8 files changed, 144 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index a04e1d44ea6..d4127e53fa9 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -2482,11 +2482,11 @@ def TFL_TileOp: TFL_Op<"tile", [NoSideEffect, SameOperandsAndResultsScale, }]; let arguments = (ins - TFL_TensorOf<[F32, I1, I32, I64, TFL_Uint8, QUI8]>:$input, + TFL_TensorOf<[F32, I1, I32, I64, TFL_Uint8, QUI8, TFL_Str]>:$input, TFL_I32OrI64Tensor:$multiples); let results = (outs - TFL_TensorOf<[F32, I1, I32, I64, TFL_Uint8, QUI8]>:$output); + TFL_TensorOf<[F32, I1, I32, I64, TFL_Uint8, QUI8, TFL_Str]>:$output); let hasOptions = 0; } diff --git a/tensorflow/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc index 5e2de955983..e8eebd81025 100644 --- a/tensorflow/lite/kernels/register.cc +++ b/tensorflow/lite/kernels/register.cc @@ -210,7 +210,9 @@ BuiltinOpResolver::BuiltinOpResolver() { AddBuiltin(BuiltinOperator_TRANSPOSE_CONV, Register_TRANSPOSE_CONV(), /* min_version */ 1, /* max_version */ 2); - AddBuiltin(BuiltinOperator_TILE, Register_TILE()); + AddBuiltin(BuiltinOperator_TILE, Register_TILE(), + /* min_version */ 1, + /* max_version */ 2); AddBuiltin(BuiltinOperator_SUM, Register_SUM(), /* min_version */ 1, /* max_version */ 2); diff --git a/tensorflow/lite/kernels/tile.cc b/tensorflow/lite/kernels/tile.cc index edbe711d807..64f6bd05485 100644 --- a/tensorflow/lite/kernels/tile.cc +++ b/tensorflow/lite/kernels/tile.cc @@ -83,6 +83,18 @@ void CopyMultipleTimes(const T* in_data, int32_t in_size, M multiplier, } } +template +void CopyStringMultipleTimes(const TfLiteTensor* in_data, int in_data_index, + const int dimension_size, M multiplier, + DynamicBuffer* buffer) { + for (M i = 0; i < multiplier; ++i) { + for (int j = 0; j < dimension_size; ++j) { + const auto string_ref = GetString(in_data, in_data_index + j); + buffer->AddString(string_ref.str, string_ref.len); + } + } +} + template std::pair TileOneDimension(const TfLiteIntArray& in_dimensions, const T* in_data, const M* multipliers, @@ -116,6 +128,38 @@ std::pair TileOneDimension(const TfLiteIntArray& in_dimensions, static_cast(total_tiled_stride_size * multipliers[dimension])); } +template +std::pair TileStringOneDimension( + const TfLiteIntArray& in_dimensions, const TfLiteTensor* in_data, + int in_data_index, const M* multipliers, DynamicBuffer* buffer, + int buffer_index, int dimension, TfLiteTensor* out_data) { + const int dimension_size = in_dimensions.data[dimension]; + if (dimension == in_dimensions.size - 1) { + CopyStringMultipleTimes(in_data, 
in_data_index, dimension_size, + multipliers[dimension], buffer); + return {dimension_size, + dimension_size * static_cast(multipliers[dimension])}; + } + + int total_stride_size = 0, total_tiled_stride_size = 0; + for (int i = 0; i < dimension_size; ++i) { + int stride_size, tiled_stride_size; + std::tie(stride_size, tiled_stride_size) = TileStringOneDimension( + in_dimensions, in_data, in_data_index + total_stride_size, multipliers, + buffer, buffer_index + total_tiled_stride_size, dimension + 1, + out_data); + total_stride_size += stride_size; + total_tiled_stride_size += tiled_stride_size; + } + + buffer->WriteToTensor(out_data, /*new_shape=*/nullptr); + CopyStringMultipleTimes(out_data, buffer_index, total_tiled_stride_size, + multipliers[dimension] - 1, buffer); + + return {total_stride_size, + total_tiled_stride_size * static_cast(multipliers[dimension])}; +} + template void Tile(const TfLiteIntArray& in_dimensions, const TfLiteTensor* in_data, const TfLiteTensor* multipliers, TfLiteTensor* out_data) { @@ -135,6 +179,26 @@ void Tile(const TfLiteIntArray& in_dimensions, const TfLiteTensor* in_data, break; } } + +void TileString(const TfLiteIntArray& in_dimensions, + const TfLiteTensor* in_data, const TfLiteTensor* multipliers, + DynamicBuffer* buffer, TfLiteTensor* out_data) { + // Doing recursively tiling from top to down dimension. + switch (multipliers->type) { + case kTfLiteInt32: + TileStringOneDimension(in_dimensions, in_data, 0, + GetTensorData(multipliers), buffer, 0, 0, + out_data); + break; + case kTfLiteInt64: + TileStringOneDimension(in_dimensions, in_data, 0, + GetTensorData(multipliers), buffer, 0, 0, + out_data); + break; + default: + break; + } +} } // namespace TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { @@ -185,6 +249,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { case kTfLiteInt64: Tile(*(input->dims), input, multipliers, output); break; + case kTfLiteString: { + DynamicBuffer buffer; + TileString(*(input->dims), input, multipliers, &buffer, output); + buffer.WriteToTensor(output, /*new_shape=*/nullptr); + break; + } case kTfLiteBool: Tile(*(input->dims), input, multipliers, output); break; diff --git a/tensorflow/lite/kernels/tile_test.cc b/tensorflow/lite/kernels/tile_test.cc index 79b791c8c92..5a7461a8127 100644 --- a/tensorflow/lite/kernels/tile_test.cc +++ b/tensorflow/lite/kernels/tile_test.cc @@ -202,6 +202,54 @@ TEST_P(TileTest, Int64Matrix64Multipliers) { /*multiply_type=*/TensorType_INT64, GetParam()); } +TEST_P(TileTest, StringMatrix) { + // TODO(b/138722124): Enable these tests on NNAPI. + if (SingleOpModel::GetForceUseNnapi()) { + return; + } + Check( + /*input_shape=*/{2, 3}, + /*input_data=*/{"AA", "AB", "AC", "BA", "BB", "BC"}, + /*multipliers_data=*/{1, 2}, /*exp_output_shape=*/{2, 6}, + /*exp_output_data=*/ + {"AA", "AB", "AC", "AA", "AB", "AC", "BA", "BB", "BC", "BA", "BB", "BC"}, + /*input_type=*/TensorType_STRING, + /*multiply_type=*/TensorType_INT32, GetParam()); +} + +TEST_P(TileTest, StringMatrix64Multipliers) { + // TODO(b/138722124): Enable these tests on NNAPI. 
+ if (SingleOpModel::GetForceUseNnapi()) { + return; + } + Check( + /*input_shape=*/{2, 3}, + /*input_data=*/{"AA", "AB", "AC", "BA", "BB", "BC"}, + /*multipliers_data=*/{2, 1}, /*exp_output_shape=*/{4, 3}, + /*exp_output_data=*/ + {"AA", "AB", "AC", "BA", "BB", "BC", "AA", "AB", "AC", "BA", "BB", "BC"}, + /*input_type=*/TensorType_STRING, + /*multiply_type=*/TensorType_INT64, GetParam()); +} + +TEST_P(TileTest, StringMatrix2) { + // TODO(b/138722124): Enable these tests on NNAPI. + if (SingleOpModel::GetForceUseNnapi()) { + return; + } + Check( + /*input_shape=*/{3, 2, 1}, + /*input_data=*/{"AA", "AB", "AC", "BA", "BB", "BC"}, + /*multipliers_data=*/{2, 2, 2}, /*exp_output_shape=*/{6, 4, 2}, + /*exp_output_data=*/ + {"AA", "AA", "AB", "AB", "AA", "AA", "AB", "AB", "AC", "AC", "BA", "BA", + "AC", "AC", "BA", "BA", "BB", "BB", "BC", "BC", "BB", "BB", "BC", "BC", + "AA", "AA", "AB", "AB", "AA", "AA", "AB", "AB", "AC", "AC", "BA", "BA", + "AC", "AC", "BA", "BA", "BB", "BB", "BC", "BC", "BB", "BB", "BC", "BC"}, + /*input_type=*/TensorType_STRING, + /*multiply_type=*/TensorType_INT32, GetParam()); +} + INSTANTIATE_TEST_SUITE_P(TileTest, TileTest, ::testing::Values(TestType::kConst, TestType::kDynamic)); diff --git a/tensorflow/lite/testing/op_tests/tile.py b/tensorflow/lite/testing/op_tests/tile.py index f486e059228..49d838c54ec 100644 --- a/tensorflow/lite/testing/op_tests/tile.py +++ b/tensorflow/lite/testing/op_tests/tile.py @@ -27,7 +27,7 @@ from tensorflow.lite.testing.zip_test_utils import register_make_test_function def make_tile_tests(options): """Make a set of tests to do tile.""" test_parameters = [{ - "input_dtype": [tf.float32, tf.int32, tf.bool], + "input_dtype": [tf.float32, tf.int32, tf.bool, tf.string], "input_shape": [[3, 2, 1], [2, 2, 2]], "multiplier_dtype": [tf.int32, tf.int64], "multiplier_shape": [[3]] diff --git a/tensorflow/lite/toco/tflite/op_version.cc b/tensorflow/lite/toco/tflite/op_version.cc index 49b7ed5c38d..09150d23f37 100644 --- a/tensorflow/lite/toco/tflite/op_version.cc +++ b/tensorflow/lite/toco/tflite/op_version.cc @@ -106,6 +106,7 @@ string GetMinimumRuntimeVersionForModel(const Model& model) { {{OperatorType::kPad, 1}, "1.5.0"}, {{OperatorType::kPad, 2}, "1.14.0"}, {{OperatorType::kTile, 1}, "1.10.1"}, + {{OperatorType::kTile, 2}, kPendingReleaseOpVersion}, {{OperatorType::kPadV2, 1}, "1.9.0"}, {{OperatorType::kPadV2, 2}, "1.14.0"}, {{OperatorType::kReshape, 1}, "1.5.0"}, diff --git a/tensorflow/lite/tools/versioning/op_version.cc b/tensorflow/lite/tools/versioning/op_version.cc index 77c39ff7073..b699f0dbc9b 100644 --- a/tensorflow/lite/tools/versioning/op_version.cc +++ b/tensorflow/lite/tools/versioning/op_version.cc @@ -287,6 +287,12 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { } return 1; + case BuiltinOperator_TILE: + if (op_sig.input_types.at(0) == TensorType_STRING) { + return 2; + } + return 1; + case BuiltinOperator_AVERAGE_POOL_2D: case BuiltinOperator_ADD: case BuiltinOperator_SPACE_TO_BATCH_ND: diff --git a/tensorflow/lite/tools/versioning/op_version_test.cc b/tensorflow/lite/tools/versioning/op_version_test.cc index b417fc5c47d..8cd873aa697 100644 --- a/tensorflow/lite/tools/versioning/op_version_test.cc +++ b/tensorflow/lite/tools/versioning/op_version_test.cc @@ -432,4 +432,17 @@ TEST(OpVersionTest, VersioningDepthwiseConv2DTest) { fake_op_sig.options.depthwise_conv_2d.dilation_h_factor = 1; EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 1); } +TEST(OpVersionTest, VersioningTileOperatorTest) { + OpSignature fake_op_sig 
= { + .op = BuiltinOperator_TILE, + .input_types = std::vector{TensorType_INT32}, + }; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 1); + + fake_op_sig = { + .op = BuiltinOperator_TILE, + .input_types = std::vector{TensorType_STRING}, + }; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 2); +} } // namespace tflite From 9eb62ad4f8454e284e903229b5da8f300ad108ed Mon Sep 17 00:00:00 2001 From: Tiezhen WANG Date: Thu, 20 Feb 2020 18:27:25 -0800 Subject: [PATCH 423/442] TFLM: Reduce the latency for Reshape operator. This is achieved by moving shape check to prepare so that it's ran only once. PiperOrigin-RevId: 296343354 Change-Id: Ie72628b5abf8cc949dd4c8d1190007bab5f0ff1e --- tensorflow/lite/micro/kernels/reshape.cc | 12 +++++++----- tensorflow/lite/micro/kernels/reshape_test.cc | 8 +++++++- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/tensorflow/lite/micro/kernels/reshape.cc b/tensorflow/lite/micro/kernels/reshape.cc index d7a5a6181fb..376c612ef59 100644 --- a/tensorflow/lite/micro/kernels/reshape.cc +++ b/tensorflow/lite/micro/kernels/reshape.cc @@ -69,18 +69,20 @@ TfLiteStatus ReshapeOutput(TfLiteContext* context, TfLiteNode* node) { TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, NumInputs(node) == 1 || NumInputs(node) == 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + TF_LITE_ENSURE_EQ(context, ReshapeOutput(context, node), kTfLiteOk); return kTfLiteOk; } TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input = GetInput(context, node, kInputTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - if (ReshapeOutput(context, node) != kTfLiteOk) { - return kTfLiteError; - } - for (size_t i = 0; i < input->bytes; ++i) { - output->data.raw[i] = input->data.raw[i]; + // Do nothing for in-place reshape. + if (input->data.raw != output->data.raw) { + // Otherwise perform reshape with copy. + for (size_t i = 0; i < input->bytes; ++i) { + output->data.raw[i] = input->data.raw[i]; + } } return kTfLiteOk; } diff --git a/tensorflow/lite/micro/kernels/reshape_test.cc b/tensorflow/lite/micro/kernels/reshape_test.cc index e252e13fa50..16d70a0159e 100644 --- a/tensorflow/lite/micro/kernels/reshape_test.cc +++ b/tensorflow/lite/micro/kernels/reshape_test.cc @@ -77,7 +77,13 @@ void TestReshapeImpl(TfLiteTensor* input_tensor, TfLiteTensor* shape_tensor, TF_LITE_MICRO_EXPECT_EQ(registration->free, nullptr); if (registration->prepare) { - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); + // Error can happen either in Prepare or eval stage. + auto status = registration->prepare(&context, &node); + if (status == kTfLiteError && expect_failure) { + return; + } else { + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, status); + } } if (expect_failure) { TF_LITE_MICRO_EXPECT_EQ(kTfLiteError, From c5e211e8b56033485cccb9395fd3a3c55c677a30 Mon Sep 17 00:00:00 2001 From: Michael Gester Date: Thu, 20 Feb 2020 18:28:58 -0800 Subject: [PATCH 424/442] Fixed control dependency errors in BreakUpIslands Previously, some necessary control dependencies after breaking up islands were missed (e.g., a dependency between a newly created island and a SwitchNOp), and "Adding control dependency not supported" errors were reported in such cases. Fixed this and added tests that contain all previously problematic ops and check that control dependencies are now correctly added. 
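As a hand-written sketch of the pattern the new tests exercise (names and shapes invented, simplified from the test cases in this change), an island that wraps a side-effecting op and feeds a `tf_executor.SwitchN` is split into per-op islands, and the control result of the island wrapping the side-effecting op is threaded into the SwitchN:

```mlir
// Hand-written sketch only; names are invented and the exact printed form of
// the pass output may differ. Before the pass, one island holds both ops:
func @before_breakup(%index: tensor<i32>) {
  tf_executor.graph {
    %island:2 = tf_executor.island {
      %const = "tf.Const"() {value = dense<1> : tensor<i32>} : () -> tensor<i32>
      %print = "tf.Print"(%const) : (tensor<i32>) -> tensor<i32>
      tf_executor.yield %const : tensor<i32>
    }
    %switchn:4 = tf_executor.SwitchN %island#0, %index of 3 : tensor<i32>
    tf_executor.fetch %switchn#0 : tensor<i32>
  }
  return
}

// After the pass (roughly): each op gets its own island, and the control
// result of the island wrapping the side-effecting tf.Print becomes a
// control input of the SwitchN so the ordering is preserved.
func @after_breakup(%index: tensor<i32>) {
  tf_executor.graph {
    %const_island:2 = tf_executor.island {
      %const = "tf.Const"() {value = dense<1> : tensor<i32>} : () -> tensor<i32>
      tf_executor.yield %const : tensor<i32>
    }
    %print_island:2 = tf_executor.island {
      %print = "tf.Print"(%const_island#0) : (tensor<i32>) -> tensor<i32>
      tf_executor.yield %print : tensor<i32>
    }
    %switchn:4 = tf_executor.SwitchN %const_island#0, %index of 3 (%print_island#1) : tensor<i32>
    tf_executor.fetch %switchn#0 : tensor<i32>
  }
  return
}
```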
PiperOrigin-RevId: 296343508 Change-Id: I332ae3e7cf0483129063c4f520174617ec4ebf1a --- .../mlir/tensorflow/ir/tf_executor.cc | 4 +- .../mlir/tensorflow/ir/tf_executor_ops.td | 4 +- .../tensorflow/tests/breakup-islands.mlir | 64 +++++++++++++++++++ .../tensorflow/translate/breakup-islands.cc | 15 +++-- 4 files changed, 77 insertions(+), 10 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc index 4b6ff55e5ea..c6144ec21e3 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc @@ -573,9 +573,9 @@ void Print(SwitchNOp switchn, OpAsmPrinter &p) { ParseResult ParseSwitchNOp(OpAsmParser &parser, OperationState &result) { // Parsing: - // %2:6 = tf_executor.SwitchN %0, %1 by 5 : tensor + // %2:6 = tf_executor.SwitchN %0, %1 of 5 : tensor // Where the first operand is the data to replicate, the second is an i32 - // indicating which output to populate, followed by the keyword `by` and the + // indicating which output to populate, followed by the keyword `of` and the // number of outputs (+1 for the control token). SmallVector op_infos; SmallVector types; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td index 0987ae3d668..38f72f24bd1 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td @@ -165,7 +165,7 @@ def TfExecutor_IslandOp : TfExecutor_Op<"island", The `tf_executor.island` operation has a single region with a single block attached (only functional control flow is allowed). The block is terminated by a `tf_executor.yield` operation. The operands of the terminator - correspond to the result values of the `tf_executor.graph` operation. An + correspond to the result values of the `tf_executor.island` operation. An extra result of type `!tf_executor.control` is always produced by every `tf_executor.island`. Within an island, execution semantics follow standard sequential behavior as @@ -299,7 +299,7 @@ def TfExecutor_SwitchNOp : TfExecutor_Op<"SwitchN", .SetShapeFn(SwitchNShape); For example: - %2:6 = tf_executor.SwitchN %0, %1 by 5 : tensor + %2:6 = tf_executor.SwitchN %0, %1 of 5 : tensor Note: One additional result corresponds to the control output. }]; diff --git a/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir b/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir index 8659f52e301..61e0772726c 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir @@ -280,3 +280,67 @@ func @empty_island_multiple_data_results(%arg0: tensor<*xf32>, %arg1: tensor<*xi } return } + +// The following tests check that certain control dependencies between islands +// and certain tf_executor ops are added correctly. 
+ +// CHECK: %[[CONTROL:[^ ,]*]] = tf_executor.island wraps "tf.Print" +// CHECK: tf_executor.NextIteration.Sink [{{.*}}] {{.*}}, %[[CONTROL]] +func @next_iteration_sink_control_input() { + tf_executor.graph { + %source:3 = tf_executor.NextIteration.Source : tensor<*xi32> + %island:2 = tf_executor.island { + %const = "tf.Const"() {value = dense<1> : tensor} : () -> tensor<*xi32> + %print = "tf.Print"(%const) : (tensor<*xi32>) -> (tensor<*xi32>) + tf_executor.yield %const : tensor<*xi32> + } + tf_executor.NextIteration.Sink[%source#1] %island#0 : tensor<*xi32> + tf_executor.fetch %island#0 : tensor<*xi32> + } + return +} + +// CHECK: %[[CONTROL:[^ ,]*]] = tf_executor.island wraps "tf.Print" +// CHECK: tf_executor.LoopCond {{.*}}, %[[CONTROL]] +func @loop_cond_control_input() { + tf_executor.graph { + %island:2 = tf_executor.island { + %const = "tf.Const"() {value = dense<1> : tensor} : () -> tensor<*xi1> + %print = "tf.Print"(%const) : (tensor<*xi1>) -> (tensor<*xi1>) + tf_executor.yield %const : tensor<*xi1> + } + %loop_cond:2 = tf_executor.LoopCond %island#0 : tensor<*xi1> + tf_executor.fetch %loop_cond#0 : tensor<*xi1> + } + return +} + +// CHECK: %[[CONTROL:[^ ,]*]] = tf_executor.island wraps "tf.Print" +// CHECK: tf_executor.Enter {{.*}}, %[[CONTROL]] +func @enter_control_input() { + tf_executor.graph { + %island:2 = tf_executor.island { + %const = "tf.Const"() {value = dense<1> : tensor} : () -> tensor<*xi32> + %print = "tf.Print"(%const) : (tensor<*xi32>) -> (tensor<*xi32>) + tf_executor.yield %const : tensor<*xi32> + } + %enter:2 = tf_executor.Enter %island#0 frame "some/frame" : tensor<*xi32> + tf_executor.fetch %enter#0 : tensor<*xi32> + } + return +} + +// CHECK: %[[CONTROL:[^ ,]*]] = tf_executor.island wraps "tf.Print" +// CHECK: tf_executor.SwitchN {{.*}}, {{.*}} of {{[0-9]*}} (%[[CONTROL]]) +func @switchn_control_input(%arg1: tensor) { + tf_executor.graph { + %island:2 = tf_executor.island { + %const = "tf.Const"() {value = dense<1> : tensor} : () -> tensor<*xi32> + %print = "tf.Print"(%const) : (tensor<*xi32>) -> (tensor<*xi32>) + tf_executor.yield %const : tensor<*xi32> + } + %switchn:4 = tf_executor.SwitchN %island#0, %arg1 of 3: tensor<*xi32> + tf_executor.fetch %switchn#0 : tensor<*xi32> + } + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc b/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc index d40eec62cdc..8136db7d164 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc @@ -60,7 +60,7 @@ void BreakUpIslands::runOnFunction() { getOperation().getBody().front().front()); } if (!graph_op) { - getOperation().emitError("Expected function to contain only a graph_op"); + getOperation().emitError("expected function to contain only a graph_op"); signalPassFailure(); return; } @@ -239,7 +239,7 @@ void BreakUpIslands::BreakUpIsland( } else { // TODO(parkers): Any defining op that has a control output can be handled // just like an island. - fetch.getDefiningOp()->emitError("Fetching non-island as dependency."); + fetch.getDefiningOp()->emitError("fetching non-island as dependency"); return signalPassFailure(); } } @@ -298,18 +298,21 @@ void BreakUpIslands::BreakUpIsland( auto& sink_island_control = sink_island_controls[0]; island_op.control().replaceAllUsesWith(sink_island_control); // All existing outputs need to add sink_island_control as control input. 
+ // GraphOp, YieldOp and NextIterationSourceOp don't have control inputs so + // exclude them below. for (Value out : island_op.outputs()) { for (auto& use : out.getUses()) { Operation* owner = use.getOwner(); if (auto other_island_op = llvm::dyn_cast(owner->getParentOp())) { (*new_control_inputs)[other_island_op].push_back(sink_island_control); - } else if (llvm::isa(owner) || - llvm::isa(owner) || - llvm::isa(owner)) { + } else if (owner->getDialect() == island_op.getDialect() && + !llvm::isa(owner) && + !llvm::isa(owner) && + !llvm::isa(owner)) { (*new_control_inputs)[owner].push_back(sink_island_control); } else { - use.getOwner()->emitError("Adding control dependency not supported"); + owner->emitOpError("adding control dependency not supported"); return signalPassFailure(); } } From d33655922c8a5007c9f259252d16eec46bd66fff Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Thu, 20 Feb 2020 18:36:32 -0800 Subject: [PATCH 425/442] [TF:MLIR] Add support for folding Transpose into FusedBatchNormV3 PiperOrigin-RevId: 296344377 Change-Id: I768b18534b17e8c93994279b12e72650f2f0858c --- .../mlir/tensorflow/ir/tf_generated_ops.td | 9 +- .../compiler/mlir/tensorflow/ir/tf_ops.cc | 97 +++++++++++++------ ...timization_layout_assignment_to_nchw.mlir} | 0 ...ptimization_layout_assignment_to_nhwc.mlir | 35 +++++++ ...yout_optimization_move_transposes_end.mlir | 29 +++++- 5 files changed, 136 insertions(+), 34 deletions(-) rename tensorflow/compiler/mlir/tensorflow/tests/{layout_optimization_layout_assignment.mlir => layout_optimization_layout_assignment_to_nchw.mlir} (100%) create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nhwc.mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 191e0afbdee..77997b8002d 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -2099,7 +2099,7 @@ The size of 1D Tensors matches the dimension C of the 4D Tensors. TF_DerivedOperandTypeAttr U = TF_DerivedOperandTypeAttr<3>; } -def TF_FusedBatchNormV3Op : TF_Op<"FusedBatchNormV3", [NoSideEffect]> { +def TF_FusedBatchNormV3Op : TF_Op<"FusedBatchNormV3", [NoSideEffect, TF_FoldOperandsTransposeInterface]> { let summary = "Batch normalization."; let description = [{ @@ -2130,6 +2130,13 @@ The size of 1D Tensors matches the dimension C of the 4D Tensors. 
TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; TF_DerivedOperandTypeAttr U = TF_DerivedOperandTypeAttr<1>; + + let extraClassDeclaration = [{ + // TF_FoldOperandsTransposeInterface: + SmallVector GetLayoutDependentArgs() { return {0}; } + SmallVector GetLayoutDependentResults() { return {0}; } + LogicalResult FoldOperandsPermutation(ArrayRef permutation); + }]; } def TF_GatherOp : TF_Op<"Gather", [NoSideEffect]> { diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index d4e59d7d1ee..0cc6850b813 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -300,7 +300,7 @@ SmallVector GetDataFormatPermutation(StringRef from, StringRef to) { if (from == "NHWC" && to == "NCHW") { return {0, 3, 1, 2}; } else if (from == "NCHW" && to == "NHWC") { - return {0, 1, 2, 3}; + return {0, 2, 3, 1}; } else { return {}; } @@ -385,6 +385,63 @@ LogicalResult UpdateDataFormat(StringRef data_format, Op *op) { return success(); } +// Default implementation for folding operand transpose into the operation. +// See `FoldOperandsTransposeInterface::FoldOperandsPermutation`. +template +LogicalResult FoldOperandsPermutation( + ArrayRef permutation, Op *op, + ArrayRef> shuffle_attrs = {}) { + MLIRContext *context = op->template getParentOfType().getContext(); + + // We only support NHWC <-> NCHW permutations. + static constexpr std::array kNchwToNhwc = {0, 2, 3, 1}; + static constexpr std::array kNhwcToNchw = {0, 3, 1, 2}; + + // Operation data format after folding `permutation`. + StringRef target_data_format = [&]() -> StringRef { + if (op->data_format() == "NHWC" && permutation.equals(kNchwToNhwc)) { + return "NCHW"; // cancel NCHW->NHWC operand permutation + } else if (op->data_format() == "NCHW" && permutation.equals(kNhwcToNchw)) { + return "NHWC"; // cancel NHWC->NCHW operand permutation + } else { + return ""; + } + }(); + if (target_data_format.empty()) return failure(); + + // To fold operand `permutation` into the `op` we need shuffle all layout + // dependent attributes and types with a reverse permutation, and change + // operation data format to `target_data_format`. + // + // Example: + // %1 = SomeOp(...) {data_format = NHWC} + // %2 = Transpose(%1) {permutation = NHWC->NCHW} + // %3 = Op(%2) {data_format = NCHW} + // + // To bypass %2 we have to change data format to shuffle data format from NCHW + // to NHWC, which is the reverse of operand permutation (function argument). 
+ auto reverse_permutation = + GetDataFormatPermutation(op->data_format(), target_data_format); + if (reverse_permutation.empty()) return failure(); + + op->setAttr("data_format", StringAttr::get(target_data_format, context)); + + for (auto pair : shuffle_attrs) { + StringRef attr_name = pair.first; + ArrayAttr attr_value = pair.second; + op->setAttr(attr_name, ShuffleArrayAttr(attr_value, reverse_permutation)); + } + + auto fold = cast(op->getOperation()); + for (unsigned idx : fold.GetLayoutDependentResults()) { + OpResult result = op->getOperation()->getResult(idx); + result.setType( + ShuffleRankedTensorType(result.getType(), reverse_permutation)); + } + + return success(); +} + namespace { #include "tensorflow/compiler/mlir/tensorflow/transforms/generated_canonicalize.inc" } // namespace @@ -1255,6 +1312,11 @@ static LogicalResult Verify(FusedBatchNormOp op) { return success(); } +LogicalResult FusedBatchNormV3Op::FoldOperandsPermutation( + ArrayRef permutation) { + return ::mlir::TF::FoldOperandsPermutation(permutation, this); +} + //===----------------------------------------------------------------------===// // GatherV2Op //===----------------------------------------------------------------------===// @@ -1453,37 +1515,8 @@ void MaxOp::build(Builder *builder, OperationState &result, Value input, LogicalResult MaxPoolOp::FoldOperandsPermutation( ArrayRef permutation) { - MLIRContext *context = getParentOfType().getContext(); - - // Data format after folding permutation. - StringRef target_data_format; - - // For now we only support folding of NCHW->NHWC and NHWC->NCHW permutations. - if (data_format() == "NHWC") { - static constexpr std::array kPerm = {0, 2, 3, 1}; // to NHWC - if (permutation != ArrayRef(kPerm)) return failure(); - target_data_format = "NCHW"; - - } else if (data_format() == "NCHW") { - static constexpr std::array kPerm = {0, 3, 1, 2}; // to NCHW - if (permutation != ArrayRef(kPerm)) return failure(); - target_data_format = "NHWC"; - - } else { - return failure(); - } - - auto perm = GetDataFormatPermutation(data_format(), target_data_format); - if (perm.empty()) return failure(); - - setAttr("data_format", StringAttr::get(target_data_format, context)); - setAttr("strides", ShuffleArrayAttr(strides(), perm)); - setAttr("ksize", ShuffleArrayAttr(ksize(), perm)); - - OpResult result = getOperation()->getResult(0); - result.setType(ShuffleRankedTensorType(result.getType(), perm)); - - return success(); + return ::mlir::TF::FoldOperandsPermutation( + permutation, this, {{"strides", strides()}, {"ksize", ksize()}}); } //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nchw.mlir similarity index 100% rename from tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment.mlir rename to tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nchw.mlir diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nhwc.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nhwc.mlir new file mode 100644 index 00000000000..2d87d5ccd9c --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nhwc.mlir @@ -0,0 +1,35 @@ +// RUN: tf-opt %s -tf-layout-assignment=force-data-format=NHWC -verify-diagnostics | 
FileCheck %s --dump-input=always + +// CHECK-LABEL: func @transposeConv2D +func @transposeConv2D(%input: tensor<1x3x32x32xf32>, %filter: tensor<1x1x3x8xf32>) -> tensor<1x8x32x32xf32> { + + // IMPORTANT: Tensor shapes do not match convolution parameters (stride, + // dilations, etc...). This test only verifies that changing convolution data + // layout will update all the attributes. + + // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} + // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) + + // CHECK: %[[CONV2D:[0-9]*]] = "tf.Conv2D"(%[[ARG_TRANSPOSE]], %arg1) + // CHECK-SAME: data_format = "NHWC" + // CHECK-SAME: dilations = [1, 3, 4, 2] + // CHECK-SAME: explicit_paddings = [1, 2, 5, 6, 7, 8, 3, 4] + // CHECK-SAME: padding = "EXPLICIT" + // CHECK-SAME: strides = [5, 7, 8, 6] + // CHECK-SAME: (tensor<1x32x32x3xf32>, tensor<1x1x3x8xf32>) -> tensor<1x32x32x8xf32> + + // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} + // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[CONV2D]], %[[RES_PERM]]) + // CHECK: return %[[RES_TRANSPOSE]] + + %0 = "tf.Conv2D"(%input, %filter) + { + data_format = "NCHW", + dilations = [1, 2, 3, 4], + explicit_paddings = [1, 2, 3, 4, 5, 6, 7, 8], + padding = "EXPLICIT", + strides = [5, 6, 7, 8] + } : (tensor<1x3x32x32xf32>, tensor<1x1x3x8xf32>) -> tensor<1x8x32x32xf32> + + return %0 : tensor<1x8x32x32xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir index 10fc70683b3..d89f5cbdf98 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir @@ -56,7 +56,7 @@ func @fold_into_max_pool(%arg0: tensor<1x64x112x112xf32>) -> tensor<1x56x56x64xf // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} // CHECK: %[[MAX_POOL:[0-9]*]] = "tf.MaxPool"(%arg0) {data_format = "NCHW", ksize = [1, 1, 3, 3], padding = "SAME", strides = [1, 1, 2, 2]} : (tensor<1x64x112x112xf32>) -> tensor<1x64x56x56xf32> - // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[ADD]], %[[RES_PERM]]) + // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[MAX_POOL]], %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] // Transpose NCHW -> NHWC @@ -72,3 +72,30 @@ func @fold_into_max_pool(%arg0: tensor<1x64x112x112xf32>) -> tensor<1x56x56x64xf return %2 : tensor<1x56x56x64xf32> } + +// CHECK-LABEL: func @fold_into_fused_batch_norm +func @fold_into_fused_batch_norm(%arg0: tensor<1x64x112x112xf32>, %arg1: tensor<64xf32>) -> tensor<1x112x112x64xf32> { + + // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} + // CHECK: "tf.FusedBatchNormV3"(%arg0, {{.*}} {data_format = "NCHW" + // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%y, %[[RES_PERM]]) + // CHECK: return %[[RES_TRANSPOSE]] + + // Transpose NCHW -> NHWC + %0 = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} : () -> tensor<4xi64> + %1 = "tf.Transpose"(%arg0, %0) : (tensor<1x64x112x112xf32>, tensor<4xi64>) -> tensor<1x112x112x64xf32> + + // Compute FusedBatchNormV3 in NHWC format + %2, %batch_mean, %batch_var, %reserve_1, %reserve_2, %reserve_3 + = "tf.FusedBatchNormV3"(%1, %arg1, %arg1, %arg1, %arg1) + { + data_format = "NHWC", + epsilon = 1.001 : f32, + exponential_avg_factor = 
1.0 : f32, + is_training = false + } + : (tensor<1x112x112x64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) + -> (tensor<1x112x112x64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) + + return %2#0 : tensor<1x112x112x64xf32> +} \ No newline at end of file From 57c49e17b197138e5d50726ebf4152f5f80f15ee Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Thu, 20 Feb 2020 18:43:00 -0800 Subject: [PATCH 426/442] Update ophint optional inputs logic (fill in optional input with constant nodes), also update the toco unidirectional lstm legalization. PiperOrigin-RevId: 296345130 Change-Id: I5b2b8262a9495ff4f3c46f35724c030a914949f6 --- tensorflow/lite/python/op_hint.py | 40 ++++++++++++++----- tensorflow/lite/toco/import_tensorflow.cc | 47 +++++++++++++++-------- 2 files changed, 62 insertions(+), 25 deletions(-) diff --git a/tensorflow/lite/python/op_hint.py b/tensorflow/lite/python/op_hint.py index 5aa212a573f..3674135721a 100644 --- a/tensorflow/lite/python/op_hint.py +++ b/tensorflow/lite/python/op_hint.py @@ -79,7 +79,9 @@ import six as _six from tensorflow.core.framework import attr_value_pb2 as _attr_value_pb2 from tensorflow.core.framework import graph_pb2 as _graph_pb2 from tensorflow.core.framework import node_def_pb2 as _node_def_pb2 +from tensorflow.python.framework import dtypes as _dtypes from tensorflow.python.framework import ops as _ops +from tensorflow.python.framework import tensor_util as _tensor_util # TODO(aselle): publicize these apis if we continue to use these. from tensorflow.python.framework.graph_util_impl import _bfs_for_reachable_nodes from tensorflow.python.framework.graph_util_impl import _extract_graph_summary @@ -996,10 +998,26 @@ def _convert_single_op_hint_to_stub(call, # Delegate to each operand to produce the proper new input for this stub node. # In particular, an aggregate input will now be a Pack of some previously # non-fused things. - for input_index in sorted_input_indices: - inputs = call.inputs[input_index] - input_name = inputs.aggregate_and_return_name_for_input(out) - new_node.input.append(input_name) + + optional_input_node = _node_def_pb2.NodeDef() + optional_input_node.name = "Const" + str(_uuid.uuid1().hex) + optional_input_node.op = "Const" + optional_input_node.attr["dtype"].CopyFrom( + _attr_value_pb2.AttrValue(type=_dtypes.float32.as_datatype_enum)) + optional_input_node.attr["value"].CopyFrom( + _attr_value_pb2.AttrValue( + tensor=_tensor_util.make_tensor_proto([-1], _dtypes.float32, [1]))) + out.node.extend([optional_input_node]) + + max_index = max(sorted_input_indices) + 1 + for cur_index in range(max_index): + if cur_index in sorted_input_indices: + inputs = call.inputs[cur_index] + input_name = inputs.aggregate_and_return_name_for_input(out) + new_node.input.append(input_name) + else: + new_node.input.append(optional_input_node.name) + new_node.attr[OpHint.TFLITE_INPUT_INDICES].list.i.extend(sorted_input_indices) # Create the function @@ -1010,11 +1028,15 @@ def _convert_single_op_hint_to_stub(call, # Now call each output argument to give them a chance to make the proper # output type and add it to our new_node. 
output_dtypes = [] - for output_index in sorted_output_indices: - output = call.outputs[output_index] - output_dtype = ( - output.aggregate_and_return_name_for_output(new_node.name, output_index, - out)) + max_output_index = max(sorted_output_indices) + 1 + for cur_index in range(max_output_index): + if cur_index in sorted_output_indices: + output = call.outputs[cur_index] + output_dtype = ( + output.aggregate_and_return_name_for_output(new_node.name, cur_index, + out)) + else: + output_dtype = optional_input_node.attr["type"].i output_dtypes.append(output_dtype) new_node.attr["_output_types"].list.type[:] = output_dtypes # TODO(aselle): what is right here? diff --git a/tensorflow/lite/toco/import_tensorflow.cc b/tensorflow/lite/toco/import_tensorflow.cc index d69c787652e..293fc654084 100644 --- a/tensorflow/lite/toco/import_tensorflow.cc +++ b/tensorflow/lite/toco/import_tensorflow.cc @@ -2410,9 +2410,6 @@ tensorflow::Status ConvertUnidirectionalSequenceLstm( DCHECK_EQ(node.op(), "UnidirectionalSequenceLstm"); const auto& indices = GetListAttr(node, "_tflite_input_indices"); - if (indices.i_size() != node.input().size()) { - return tensorflow::errors::InvalidArgument("Input size does not match."); - } auto* op = new UnidirectionalSequenceLstmOperator(); @@ -2421,20 +2418,38 @@ tensorflow::Status ConvertUnidirectionalSequenceLstm( const int kInputsSize = 20; op->inputs.resize(kInputsSize); - std::vector done(kInputsSize); - int idx = 0; - for (const string& input : node.input()) { - int real_index = indices.i(idx); - op->inputs[real_index] = (input); - done[real_index] = true; - idx++; - } - for (int idx = 0; idx < done.size(); idx++) { - if (!done[idx]) { - string optional_name = node.name() + "_" + std::to_string(idx); - model->CreateOptionalArray(optional_name); - op->inputs[idx] = optional_name; + if (indices.i_size() != node.input().size()) { + // New version, the optional inputs are filled with constant nodes. + int count = 0; + for (int idx = 0; idx < kInputsSize; ++idx) { + if (count < indices.i_size() && indices.i(count) == idx) { + // Specified input. + op->inputs[idx] = node.input(idx); + count++; + } else { + // Optional input. + string optional_name = node.name() + "_" + std::to_string(idx); + model->CreateOptionalArray(optional_name); + op->inputs[idx] = optional_name; + } + } + } else { // Legacy version. + std::vector done(kInputsSize); + int idx = 0; + for (const string& input : node.input()) { + int real_index = indices.i(idx); + op->inputs[real_index] = (input); + done[real_index] = true; + idx++; + } + + for (int idx = 0; idx < done.size(); idx++) { + if (!done[idx]) { + string optional_name = node.name() + "_" + std::to_string(idx); + model->CreateOptionalArray(optional_name); + op->inputs[idx] = optional_name; + } } } From d9444a76c0db31d205f6f8ff12997ad7fc777aa9 Mon Sep 17 00:00:00 2001 From: Terry Heo Date: Thu, 20 Feb 2020 19:07:48 -0800 Subject: [PATCH 427/442] Fix crashing on GPU elementwise ops Pass ElementwiseAttributes para as a pointer to check if it's valid or not. 
PiperOrigin-RevId: 296348071 Change-Id: Ia0a4149605d5fbff5f6a08176ea7eb004bb23315 --- .../lite/delegates/gpu/cl/kernels/elementwise.cc | 16 +++++++++------- .../lite/delegates/gpu/cl/kernels/elementwise.h | 2 +- .../gpu/cl/selectors/operation_selector.cc | 4 ++-- .../lite/delegates/gpu/gl/kernels/elementwise.cc | 12 +++++++----- 4 files changed, 19 insertions(+), 15 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc index 9fb3e45fe81..95db70a82f2 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc @@ -213,14 +213,16 @@ Status ElementwiseTwoInput::BindArguments(CLKernel* kernel) { ElementwiseTwoInput CreateElementwiseTwoInput( const CreationContext& creation_context, const OperationDef& definition, const OperationType& op_type, const BroadcastSettings& broadcast, - const ElementwiseAttributes& attr) { + const ElementwiseAttributes* attr) { ElementwiseTwoInput operation(definition, op_type, broadcast); - auto scalar = absl::get_if(&attr.param); - if (scalar) { - const auto scalar_precision = creation_context.device->IsPowerVR() - ? CalculationsPrecision::F32 - : definition.precision; - operation.SetScalarPara(FLT(scalar_precision, *scalar)); + if (attr) { + const float* scalar = absl::get_if(&attr->param); + if (scalar) { + const auto scalar_precision = creation_context.device->IsPowerVR() + ? CalculationsPrecision::F32 + : definition.precision; + operation.SetScalarPara(FLT(scalar_precision, *scalar)); + } } operation.SetLinkIndex(0); return operation; diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h index a70114d1081..8bf33b0c128 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h @@ -92,7 +92,7 @@ class ElementwiseTwoInput : public ElementwiseOperation { ElementwiseTwoInput CreateElementwiseTwoInput( const CreationContext& creation_context, const OperationDef& definition, const OperationType& op_type, const BroadcastSettings& broadcast, - const ElementwiseAttributes& attr); + const ElementwiseAttributes* attr); ElementwiseTwoInput CreateElementwiseTwoInput( const OperationDef& definition, const OperationType& op_type, diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc b/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc index 2219a6b0c50..00f2fba49e9 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc +++ b/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc @@ -246,8 +246,8 @@ Status GPUOperationFromNode(const CreationContext& creation_context, broadcast.width = IsWidthBroadcastedForSecondInput(inputs); broadcast.height = IsHeightBroadcastedForSecondInput(inputs); broadcast.channels = IsChannelsBroadcastedForSecondInput(inputs); - const auto attr = - absl::any_cast(node.operation.attributes); + const ElementwiseAttributes* attr = + absl::any_cast(&node.operation.attributes); ElementwiseTwoInput operation = CreateElementwiseTwoInput( creation_context, op_def, op_type, broadcast, attr); *gpu_op = absl::make_unique(std::move(operation)); diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/elementwise.cc b/tensorflow/lite/delegates/gpu/gl/kernels/elementwise.cc index 7ba2dd871e7..34ab756e141 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/elementwise.cc +++ 
b/tensorflow/lite/delegates/gpu/gl/kernels/elementwise.cc @@ -258,11 +258,13 @@ class ElementwiseTwoArguments : public NodeShader { if (IsSupportedBroadcast(ctx)) { return ImplementElementwiseBroadcast(ctx, generated_code); } - auto attr = - absl::any_cast(ctx.node->operation.attributes); - auto scalar = absl::get_if(&attr.param); - if (scalar) { - return ImplementElementwiseWithScalar(ctx, *scalar, generated_code); + const ElementwiseAttributes* attr = + absl::any_cast(&ctx.node->operation.attributes); + if (attr) { + auto scalar = absl::get_if(&attr->param); + if (scalar) { + return ImplementElementwiseWithScalar(ctx, *scalar, generated_code); + } } return InvalidArgumentError( "This case is not supported by elementwise with two arguments " From 120e5e6ea0de434b17e63f22403fa4a954f6205b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 20 Feb 2020 19:42:34 -0800 Subject: [PATCH 428/442] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 296351967 Change-Id: I84b026ad9fc32992818caa452fede88732faae39 --- tensorflow/go/op/wrappers.go | 78 ++++++++++++++++++++++++++---------- 1 file changed, 56 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 449a95765a5..b97c2734a6a 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. 
-// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21345,7 +21345,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22053,7 +22053,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22249,7 +22249,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22318,7 +22318,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22433,7 +22433,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22492,7 +22492,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22666,7 +22666,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22857,7 +22857,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25297,7 +25297,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25629,7 +25629,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25679,7 +25679,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25929,7 +25929,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26559,7 +26559,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27624,7 +27624,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -37603,6 +37603,40 @@ func RecvTPUEmbeddingActivations(scope *Scope, num_outputs int64, config string) return outputs } +// QuantizeAndDequantizeV2GradAttr is an optional argument to QuantizeAndDequantizeV2Grad. +type QuantizeAndDequantizeV2GradAttr func(optionalAttr) + +// QuantizeAndDequantizeV2GradAxis sets the optional axis attribute to value. +// If not specified, defaults to -1 +func QuantizeAndDequantizeV2GradAxis(value int64) QuantizeAndDequantizeV2GradAttr { + return func(m optionalAttr) { + m["axis"] = value + } +} + +// Returns the gradient of `QuantizeAndDequantizeV2`. +// +// Returns a gradient of 1 for inputs that are within the quantization range, +// or 0 otherwise. +func QuantizeAndDequantizeV2Grad(scope *Scope, gradients tf.Output, input tf.Output, input_min tf.Output, input_max tf.Output, optional ...QuantizeAndDequantizeV2GradAttr) (input_backprop tf.Output, input_min_backprop tf.Output, input_max_backprop tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "QuantizeAndDequantizeV2Grad", + Input: []tf.Input{ + gradients, input, input_min, input_max, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} + // Computes the sparse Cholesky decomposition of `input`. // // Computes the Sparse Cholesky decomposition of a sparse matrix, with the given @@ -45536,7 +45570,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 2ca35b7a30df39582c1c37cc06c1d13b9d0a2ecb Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Thu, 20 Feb 2020 21:11:35 -0800 Subject: [PATCH 429/442] [TF:MLIR] Add support for folding Transpose into Mean PiperOrigin-RevId: 296361326 Change-Id: I677bfd6aa17865514a8770b49bce6b7681d5c289 --- .../compiler/mlir/tensorflow/ir/tf_ops.cc | 32 +++++++++++++++++++ .../compiler/mlir/tensorflow/ir/tf_ops.td | 9 +++++- ...yout_optimization_move_transposes_end.mlir | 19 +++++++++++ 3 files changed, 59 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index 0cc6850b813..b206b281754 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -1536,6 +1536,38 @@ static LogicalResult Verify(MaxPoolGradOp op) { return success(); } +//===----------------------------------------------------------------------===// +// MeanOp +//===----------------------------------------------------------------------===// + +LogicalResult MeanOp::FoldOperandsPermutation(ArrayRef permutation) { + // Reduction indices must be defined by a constant operation. + auto reduction_op = + dyn_cast_or_null(reduction_indices().getDefiningOp()); + if (!reduction_op) return failure(); + + auto reductions_value = reduction_op.value().dyn_cast(); + if (!reductions_value) return failure(); + + // Prepare new reduction indices according to operand permutation. + SmallVector shuffled_reduction; + llvm::transform(reductions_value.getIntValues(), + std::back_inserter(shuffled_reduction), + [&](APInt idx) { return permutation[idx.getSExtValue()]; }); + + // Add constant operation with a new reduction indices. + OpBuilder builder(getOperation()); + auto type = mlir::RankedTensorType::get(shuffled_reduction.size(), + builder.getIntegerType(64)); + auto values = mlir::DenseIntElementsAttr::get(type, shuffled_reduction); + auto shuffled_reduction_op = builder.create(getLoc(), values); + + // Use new reduction indices. + setOperand(1, shuffled_reduction_op); + + return success(); +} + //===----------------------------------------------------------------------===// // NegOp //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td index b391d5284a5..e95fcbbdad3 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td @@ -172,7 +172,7 @@ else_branch: A function that takes 'inputs' and returns a list of }]; } -def TF_MeanOp : TF_Op<"Mean", [NoSideEffect]> { +def TF_MeanOp : TF_Op<"Mean", [NoSideEffect, TF_FoldOperandsTransposeInterface]> { let summary = "Computes the mean of elements across dimensions of a tensor."; let description = [{ @@ -195,6 +195,13 @@ retained with length 1. 
TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; + + let extraClassDeclaration = [{ + // TF_FoldOperandsTransposeInterface: + SmallVector GetLayoutDependentArgs() { return {0}; } + SmallVector GetLayoutDependentResults() { return {}; } + LogicalResult FoldOperandsPermutation(ArrayRef permutation); + }]; } def TF_LegacyCallOp : TF_Op<"LegacyCall", diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir index d89f5cbdf98..4e5a29dcfbe 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir @@ -73,6 +73,25 @@ func @fold_into_max_pool(%arg0: tensor<1x64x112x112xf32>) -> tensor<1x56x56x64xf return %2 : tensor<1x56x56x64xf32> } +// CHECK-LABEL: func @fold_into_mean +func @fold_into_mean(%arg0: tensor<1x64x112x112xf32>) -> tensor<1x64xf32> { + + // CHECK: %[[RED_IDX:[0-9]*]] = "tf.Const"() {value = dense<[2, 3]> : tensor<2xi64>} + // CHECK: %[[MEAN:[0-9]*]] = "tf.Mean"(%arg0, %[[RED_IDX]]) + // CHECK-SAME: (tensor<1x64x112x112xf32>, tensor<2xi64>) -> tensor<1x64xf32> + // CHECK: return %[[MEAN]] + + // Transpose NCHW -> NHWC + %0 = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} : () -> tensor<4xi64> + %1 = "tf.Transpose"(%arg0, %0) : (tensor<1x64x112x112xf32>, tensor<4xi64>) -> tensor<1x112x112x64xf32> + + // Compute Mean over spatial dimensions in NHWC format. + %2 = "tf.Const"() {value = dense<[1, 2]> : tensor<2xi64>} : () -> tensor<2xi64> + %3 = "tf.Mean"(%1, %2) : (tensor<1x112x112x64xf32>, tensor<2xi64>) -> tensor<1x64xf32> + + return %3 : tensor<1x64xf32> +} + // CHECK-LABEL: func @fold_into_fused_batch_norm func @fold_into_fused_batch_norm(%arg0: tensor<1x64x112x112xf32>, %arg1: tensor<64xf32>) -> tensor<1x112x112x64xf32> { From 41b6bae3d1b0c103baa331036debc92de9422a7e Mon Sep 17 00:00:00 2001 From: Blake Hechtman Date: Thu, 20 Feb 2020 21:16:59 -0800 Subject: [PATCH 430/442] [XLA] Add some more slice of pad optimizations. PiperOrigin-RevId: 296361878 Change-Id: I4dbef5e94d95f3337c1004e8c3f09c7a94148075 --- .../xla/service/algebraic_simplifier.cc | 91 ++++++++----------- .../xla/service/algebraic_simplifier_test.cc | 34 ++++++- 2 files changed, 68 insertions(+), 57 deletions(-) diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index cfbcb5a4fe2..fd373671b97 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -3204,53 +3204,6 @@ StatusOr AlgebraicSimplifierVisitor::TrySimplifyScalarSlice( return false; } - if (slice->operand(0)->opcode() == HloOpcode::kPad) { - VLOG(10) << "Trying to simplify scalar slice of pad"; - // Check there's no internal padding. Again, we could handle that too, since - // everything is statically known, but it's not worth it. - auto pad = Cast(slice->mutable_operand(0)); - auto padding_config = pad->padding_config(); - int64 rank = padding_config.dimensions_size(); - if (HasInteriorPadding(padding_config)) { - VLOG(10) << "Not folding scalar slice of pad, pad has interior padding"; - return false; - } - - // Check whether the scalar we're slicing out falls into the padding. 
- bool in_padding = [&]() { - for (int64 i = 0; i < rank; ++i) { - int64 start = slice->slice_starts(i); - int64 low = padding_config.dimensions(i).edge_padding_low(); - int64 data = pad->operand(0)->shape().dimensions(i); - if (start < low || start >= low + data) { - return true; - } - } - return false; - }(); - - if (in_padding) { - VLOG(10) << "Folding scalar slice of pad into padding value"; - TF_RETURN_IF_ERROR(ReplaceWithNewInstruction( - slice, HloInstruction::CreateReshape(slice->shape(), - pad->mutable_padding_value()))); - return true; - } else { - // We already know the output of the slice is scalar. If the padded - // value is scalar, and it's not in the padding, then it's exactly the - // output value. - bool replaced = - ReplaceInstructionIfSameShape(slice, pad->mutable_operand(0)); - if (replaced) { - VLOG(10) << "Folding scalar slice of pad into padded value"; - } else { - VLOG(10) << "Not folding scalar slice of pad into padded value as they " - "have different shapes."; - } - return replaced; - } - } - if (slice->operand(0)->opcode() == HloOpcode::kConcatenate) { VLOG(10) << "Trying to simplify scalar slice of concat"; // Only do this for R1, there's no chance of this being useful otherwise. @@ -3356,20 +3309,54 @@ Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice) { HloInstruction* pad; HloInstruction* pad_operand; if (Match(slice, m::Slice(m::Pad(&pad, m::Op(&pad_operand), m::Op())))) { + // Is the result of the slice the pad operand. bool slice_undoes_pad = true; + // Can the slice be moved to the pad_operand without any padding being read. + bool slice_inside_pad = true; + // Does this slice slice out pading only. + bool slice_in_padding = false; + std::vector new_starts = slice->slice_starts(); + std::vector new_limits = slice->slice_limits(); for (int64 i = 0; i < slice->shape().rank(); ++i) { - if (slice->slice_starts(i) != - pad->padding_config().dimensions(i).edge_padding_low()) { + const int64 start = slice->slice_starts(i); + const int64 stride = slice->slice_strides(i); + const int64 limit = slice->slice_limits(i); + const int64 size = pad->shape().dimensions(i); + + const auto& dim = pad->padding_config().dimensions(i); + const int64 low = dim.edge_padding_low(); + const int64 high = dim.edge_padding_high(); + const int64 interior = dim.interior_padding(); + const int64 edge = size - high; + + if (limit <= low || start >= edge) { + slice_in_padding = true; + break; + } + + if (start != low || stride - 1 != interior) { slice_undoes_pad = false; } - if (slice->slice_strides(i) - 1 != - pad->padding_config().dimensions(i).interior_padding()) { - slice_undoes_pad = false; + + if (start < low || limit > edge || interior != 0 || stride != 1) { + slice_inside_pad = false; } + new_starts[i] -= low; + new_limits[i] -= low; + } + if (slice_in_padding) { + return ReplaceInstruction( + slice, MakeBroadcastHlo(pad->mutable_operand(1), {}, slice->shape())); } if (slice_undoes_pad && ReplaceInstructionIfSameShape(slice, pad_operand)) { return Status::OK(); } + if (slice_inside_pad) { + TF_ASSIGN_OR_RETURN(HloInstruction * new_slice, + MakeSliceHlo(pad_operand, new_starts, new_limits, + slice->slice_strides())); + return ReplaceInstruction(slice, new_slice); + } } if (slice->operand(0)->opcode() == HloOpcode::kSlice && diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc index 8f66f8084f3..31fa125b3e1 100755 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc 
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -4389,7 +4389,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadLow) { AlgebraicSimplifier simplifier(options); EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, GmockMatch(m::Reshape(m::Constant()))); + EXPECT_THAT(root, GmockMatch(m::Broadcast(m::Constant()))); } TEST_F(AlgebraicSimplifierTest, SliceOfPadHigh) { @@ -4410,7 +4410,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadHigh) { AlgebraicSimplifier simplifier(options); EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, GmockMatch(m::Reshape(m::Constant()))); + EXPECT_THAT(root, GmockMatch(m::Broadcast(m::Constant()))); } TEST_F(AlgebraicSimplifierTest, SliceOfPadMidNonScalar) { @@ -4429,7 +4429,31 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidNonScalar) { AlgebraicSimplifierOptions options; AlgebraicSimplifier simplifier(options); - EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie()); + EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + EXPECT_THAT(module->entry_computation()->root_instruction(), + GmockMatch(m::Slice(m::Parameter(0)))); +} + +TEST_F(AlgebraicSimplifierTest, SliceOfPad) { + const char* hlo_string = R"( + HloModule module + + ENTRY test { + param = f32[3,4] parameter(0) + constant = f32[] constant(0.0) + pad = f32[8,10] pad(f32[3,4] param, f32[] constant), padding=3_2x1_5 + ROOT slice = f32[2,3] slice(f32[8,10] pad), slice={[4:6],[2:5]} + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + AlgebraicSimplifierOptions options; + AlgebraicSimplifier simplifier(options); + EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, GmockMatch(m::Slice(m::Parameter(0)))); + EXPECT_THAT(root->slice_starts(), ElementsAre(1, 1)); } TEST_F(AlgebraicSimplifierTest, SliceOfPadMidScalarConstant) { @@ -4450,7 +4474,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidScalarConstant) { AlgebraicSimplifier simplifier(options); EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, GmockMatch(m::Reshape(m::Constant()))); + EXPECT_THAT(root, GmockMatch(m::Broadcast(m::Constant()))); } TEST_F(AlgebraicSimplifierTest, SliceOfPadMidScalar) { @@ -4494,7 +4518,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadSomeDimsInPadding) { AlgebraicSimplifier simplifier(options); EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, GmockMatch(m::Reshape(m::ConstantScalar(-7.0)))); + EXPECT_THAT(root, GmockMatch(m::Broadcast(m::ConstantScalar(-7.0)))); } TEST_F(AlgebraicSimplifierTest, SliceOfConcatScalarInput) { From e38ef04eca773ff3c274a913eb34c351836e8b40 Mon Sep 17 00:00:00 2001 From: Dayeong Lee Date: Thu, 20 Feb 2020 21:23:59 -0800 Subject: [PATCH 431/442] Fix ProfilingListener for subclasses to override. Fix BenchmarkTfLiteModel to pass ProfileSummaryFormatter to ProfilingListener. 
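
The structural change here is ownership: ProfileSummarizer now accepts a shared formatter, so the benchmark can build one ProfileSummaryFormatter (CSV or default) and hand the same instance to both the init-phase and run-phase summarizers rather than having the listener construct its own. A rough standalone C++ sketch of that ownership pattern follows; the class names are simplified stand-ins for the TFLite profiling types, not the real API.

#include <iostream>
#include <memory>
#include <string>

// Simplified stand-ins for ProfileSummaryFormatter and its CSV variant.
struct SummaryFormatter {
  virtual ~SummaryFormatter() = default;
  virtual std::string Name() const { return "default"; }
};
struct CsvSummaryFormatter : SummaryFormatter {
  std::string Name() const override { return "csv"; }
};

// Stand-in for ProfileSummarizer: holds a shared, caller-provided formatter.
class Summarizer {
 public:
  explicit Summarizer(std::shared_ptr<SummaryFormatter> formatter)
      : formatter_(std::move(formatter)) {}
  std::string FormatterName() const { return formatter_->Name(); }

 private:
  std::shared_ptr<SummaryFormatter> formatter_;
};

// Stand-in for the small factory the patch adds on the benchmark side.
std::shared_ptr<SummaryFormatter> CreateFormatter(bool format_as_csv) {
  if (format_as_csv) return std::make_shared<CsvSummaryFormatter>();
  return std::make_shared<SummaryFormatter>();
}

int main() {
  auto formatter = CreateFormatter(/*format_as_csv=*/true);
  // One formatter instance, shared by both summarizers.
  Summarizer init_summarizer(formatter);
  Summarizer run_summarizer(formatter);
  std::cout << init_summarizer.FormatterName() << " "
            << run_summarizer.FormatterName() << "\n";
  return 0;
}
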
PiperOrigin-RevId: 296362673 Change-Id: I9e494202c03d8794effdf11eb1bdf1f69d62d35c --- .../lite/profiling/profile_summarizer.cc | 4 ++-- .../lite/profiling/profile_summarizer.h | 6 ++--- tensorflow/lite/tools/benchmark/BUILD | 1 + .../tools/benchmark/benchmark_tflite_model.cc | 11 ++++++++- .../tools/benchmark/benchmark_tflite_model.h | 1 - .../tools/benchmark/profiling_listener.cc | 24 +++++++------------ .../lite/tools/benchmark/profiling_listener.h | 19 ++++++++------- 7 files changed, 36 insertions(+), 30 deletions(-) diff --git a/tensorflow/lite/profiling/profile_summarizer.cc b/tensorflow/lite/profiling/profile_summarizer.cc index a4c763e4b28..acf630c93cf 100644 --- a/tensorflow/lite/profiling/profile_summarizer.cc +++ b/tensorflow/lite/profiling/profile_summarizer.cc @@ -89,8 +89,8 @@ OperatorDetails GetOperatorDetails(const tflite::Interpreter& interpreter, } // namespace ProfileSummarizer::ProfileSummarizer( - std::unique_ptr summary_formatter) - : summary_formatter_(std::move(summary_formatter)) { + std::shared_ptr summary_formatter) + : summary_formatter_(summary_formatter) { // Create stats calculator for the primary graph. stats_calculator_map_[0] = std::unique_ptr( new tensorflow::StatsCalculator( diff --git a/tensorflow/lite/profiling/profile_summarizer.h b/tensorflow/lite/profiling/profile_summarizer.h index 1348761b792..960c6ba7c3d 100644 --- a/tensorflow/lite/profiling/profile_summarizer.h +++ b/tensorflow/lite/profiling/profile_summarizer.h @@ -32,8 +32,8 @@ namespace profiling { class ProfileSummarizer { public: explicit ProfileSummarizer( - std::unique_ptr summary_formatter = - std::make_unique()); + std::shared_ptr summary_formatter = + std::make_shared()); virtual ~ProfileSummarizer() {} // Process profile events to update statistics for operator invocations. @@ -70,7 +70,7 @@ class ProfileSummarizer { std::unique_ptr delegate_stats_calculator_; // Summary formatter for customized output formats. - std::unique_ptr summary_formatter_; + std::shared_ptr summary_formatter_; }; } // namespace profiling diff --git a/tensorflow/lite/tools/benchmark/BUILD b/tensorflow/lite/tools/benchmark/BUILD index 72968fc8e24..5a413112e2f 100644 --- a/tensorflow/lite/tools/benchmark/BUILD +++ b/tensorflow/lite/tools/benchmark/BUILD @@ -118,6 +118,7 @@ cc_library( deps = [ ":benchmark_model_lib", "//tensorflow/lite/profiling:profile_summarizer", + "//tensorflow/lite/profiling:profile_summary_formatter", "//tensorflow/lite/profiling:profiler", ], ) diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc index 23b76a921c5..6b1e9819312 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc +++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc @@ -185,6 +185,13 @@ std::vector TfLiteIntArrayToVector(const TfLiteIntArray* int_array) { return values; } +std::shared_ptr +CreateProfileSummaryFormatter(bool format_as_csv) { + return format_as_csv + ? 
std::make_shared() + : std::make_shared(); +} + } // namespace BenchmarkParams BenchmarkTfLiteModel::DefaultParams() { @@ -566,7 +573,9 @@ BenchmarkTfLiteModel::MayCreateProfilingListener() const { if (!params_.Get("enable_op_profiling")) return nullptr; return std::unique_ptr(new ProfilingListener( interpreter_.get(), params_.Get("max_profiling_buffer_entries"), - params_.Get("profiling_output_csv_file"))); + params_.Get("profiling_output_csv_file"), + CreateProfileSummaryFormatter( + !params_.Get("profiling_output_csv_file").empty()))); } TfLiteStatus BenchmarkTfLiteModel::RunImpl() { return interpreter_->Invoke(); } diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h index 1d056bdf0cf..a0bcce843ab 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h +++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h @@ -24,7 +24,6 @@ limitations under the License. #include #include "tensorflow/lite/model.h" -#include "tensorflow/lite/profiling/profile_summary_formatter.h" #include "tensorflow/lite/profiling/profiler.h" #include "tensorflow/lite/tools/benchmark/benchmark_model.h" diff --git a/tensorflow/lite/tools/benchmark/profiling_listener.cc b/tensorflow/lite/tools/benchmark/profiling_listener.cc index 8d7a0fe3537..50df69c4b7c 100644 --- a/tensorflow/lite/tools/benchmark/profiling_listener.cc +++ b/tensorflow/lite/tools/benchmark/profiling_listener.cc @@ -20,14 +20,15 @@ limitations under the License. namespace tflite { namespace benchmark { -ProfilingListener::ProfilingListener(Interpreter* interpreter, - uint32_t max_num_entries, - const std::string& csv_file_path) - : interpreter_(interpreter), - profiler_(max_num_entries), - run_summarizer_(CreateProfileSummaryFormatter(!csv_file_path.empty())), - init_summarizer_(CreateProfileSummaryFormatter(!csv_file_path.empty())), - csv_file_path_(csv_file_path) { +ProfilingListener::ProfilingListener( + Interpreter* interpreter, uint32_t max_num_entries, + const std::string& csv_file_path, + std::shared_ptr summarizer_formatter) + : run_summarizer_(summarizer_formatter), + init_summarizer_(summarizer_formatter), + csv_file_path_(csv_file_path), + interpreter_(interpreter), + profiler_(max_num_entries) { TFLITE_BENCHMARK_CHECK(interpreter); interpreter_->SetProfiler(&profiler_); @@ -85,12 +86,5 @@ void ProfilingListener::WriteOutput(const std::string& header, (*stream) << data << std::endl; } -std::unique_ptr -ProfilingListener::CreateProfileSummaryFormatter(bool format_as_csv) const { - return format_as_csv - ? std::make_unique() - : std::make_unique(); -} - } // namespace benchmark } // namespace tflite diff --git a/tensorflow/lite/tools/benchmark/profiling_listener.h b/tensorflow/lite/tools/benchmark/profiling_listener.h index 9c0f6745bbb..0b2772baea1 100644 --- a/tensorflow/lite/tools/benchmark/profiling_listener.h +++ b/tensorflow/lite/tools/benchmark/profiling_listener.h @@ -16,8 +16,11 @@ limitations under the License. #ifndef TENSORFLOW_LITE_TOOLS_BENCHMARK_PROFILING_LISTENER_H_ #define TENSORFLOW_LITE_TOOLS_BENCHMARK_PROFILING_LISTENER_H_ +#include + #include "tensorflow/lite/profiling/buffered_profiler.h" #include "tensorflow/lite/profiling/profile_summarizer.h" +#include "tensorflow/lite/profiling/profile_summary_formatter.h" #include "tensorflow/lite/tools/benchmark/benchmark_model.h" namespace tflite { @@ -26,8 +29,11 @@ namespace benchmark { // Dumps profiling events if profiling is enabled. 
class ProfilingListener : public BenchmarkListener { public: - explicit ProfilingListener(Interpreter* interpreter, uint32_t max_num_entries, - const std::string& csv_file_path = ""); + ProfilingListener( + Interpreter* interpreter, uint32_t max_num_entries, + const std::string& csv_file_path = "", + std::shared_ptr summarizer_formatter = + std::make_shared()); void OnBenchmarkStart(const BenchmarkParams& params) override; @@ -38,18 +44,15 @@ class ProfilingListener : public BenchmarkListener { void OnBenchmarkEnd(const BenchmarkResults& results) override; protected: - // Allow subclasses to create a customized summary writer during init. - virtual std::unique_ptr - CreateProfileSummaryFormatter(bool format_as_csv) const; + profiling::ProfileSummarizer run_summarizer_; + profiling::ProfileSummarizer init_summarizer_; + std::string csv_file_path_; private: void WriteOutput(const std::string& header, const string& data, std::ostream* stream); Interpreter* interpreter_; profiling::BufferedProfiler profiler_; - profiling::ProfileSummarizer run_summarizer_; - profiling::ProfileSummarizer init_summarizer_; - std::string csv_file_path_; }; } // namespace benchmark From 98aa5d0be743aca99992c0e58fc2980b332594bb Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Thu, 20 Feb 2020 21:40:12 -0800 Subject: [PATCH 432/442] [TF:MLIR] Make Relu layout agnostic operation PiperOrigin-RevId: 296364496 Change-Id: I86e5f2057984f85333f39f618beeda6cc862afad --- .../compiler/mlir/tensorflow/ir/tf_generated_ops.td | 2 +- .../layout_optimization_move_transposes_begin.mlir | 8 ++++---- .../tests/layout_optimization_move_transposes_end.mlir | 10 +++++----- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 77997b8002d..411ba653bec 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -4860,7 +4860,7 @@ I.e., \\(y = 1 / x\\). 
let hasCanonicalizer = 1; } -def TF_ReluOp : TF_Op<"Relu", [NoSideEffect, SameOperandsAndResultType]> { +def TF_ReluOp : TF_Op<"Relu", [NoSideEffect, SameOperandsAndResultType, TF_LayoutAgnostic]> { let summary = "Computes rectified linear: `max(features, 0)`."; let description = [{ diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_begin.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_begin.mlir index adb9059256c..f61f1216064 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_begin.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_begin.mlir @@ -20,12 +20,12 @@ func @move_across_multiple_ops(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32 // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) - // CHECK: %[[TANH0:[0-9]*]] = "tf.Tanh"(%[[ARG_TRANSPOSE]]) {{.*}} tensor<1x8x4x4xf32> - // CHECK: %[[TANH1:[0-9]*]] = "tf.Tanh"(%[[TANH0]]) {{.*}} tensor<1x8x4x4xf32> - // CHECK: return %[[TANH1]] + // CHECK: %[[TANH:[0-9]*]] = "tf.Tanh"(%[[ARG_TRANSPOSE]]) {{.*}} tensor<1x8x4x4xf32> + // CHECK: %[[RELU:[0-9]*]] = "tf.Relu"(%[[TANH]]) {{.*}} tensor<1x8x4x4xf32> + // CHECK: return %[[RELU]] %0 = "tf.Tanh"(%arg0) : (tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> - %1 = "tf.Tanh"(%0) : (tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> + %1 = "tf.Relu"(%0) : (tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> %2 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> %3 = "tf.Transpose"(%1, %2) : (tensor<1x4x4x8xf32>, tensor<4xi64>) -> tensor<1x8x4x4xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir index 4e5a29dcfbe..1bc61387a0d 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_end.mlir @@ -19,15 +19,15 @@ func @move_across_single_op(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { func @move_across_multiple_ops(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> { // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} - // CHECK: %[[TANH0:[0-9]*]] = "tf.Tanh"(%arg0) {{.*}} tensor<1x4x4x8xf32> - // CHECK: %[[TANH1:[0-9]*]] = "tf.Tanh"(%[[TANH0]]) {{.*}} tensor<1x4x4x8xf32> - // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[TANH1]], %[[RES_PERM]]) + // CHECK: %[[TANH:[0-9]*]] = "tf.Tanh"(%arg0) {{.*}} tensor<1x4x4x8xf32> + // CHECK: %[[RELU:[0-9]*]] = "tf.Relu"(%[[TANH]]) {{.*}} tensor<1x4x4x8xf32> + // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[RELU]], %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] %0 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> %1 = "tf.Transpose"(%arg0, %0) : (tensor<1x4x4x8xf32>, tensor<4xi64>) -> tensor<1x8x4x4xf32> %2 = "tf.Tanh"(%1) : (tensor<1x8x4x4xf32>) -> tensor<1x8x4x4xf32> - %3 = "tf.Tanh"(%2) : (tensor<1x8x4x4xf32>) -> tensor<1x8x4x4xf32> + %3 = "tf.Relu"(%2) : (tensor<1x8x4x4xf32>) -> tensor<1x8x4x4xf32> return %3 : tensor<1x8x4x4xf32> } @@ -117,4 +117,4 @@ func @fold_into_fused_batch_norm(%arg0: tensor<1x64x112x112xf32>, %arg1: tensor< -> (tensor<1x112x112x64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, 
tensor<64xf32>, tensor<64xf32>) return %2#0 : tensor<1x112x112x64xf32> -} \ No newline at end of file +} From 41c6bf7c6215bea9bfb9bf0a9b63f2084e6f3058 Mon Sep 17 00:00:00 2001 From: Gunhan Gulsoy Date: Thu, 20 Feb 2020 22:10:57 -0800 Subject: [PATCH 433/442] Avoid depending on the implementation of jit:flags in pywrap_tfe. It was causing the IsXlaEnabled function to return false erroneously. PiperOrigin-RevId: 296368921 Change-Id: I22507c7fa4bcf8804a333f4eafe38d4c009b76d2 --- tensorflow/python/BUILD | 4 ++-- tensorflow/tools/def_file_filter/symbols_pybind.txt | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 8126e9932fe..63593f1a428 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -5822,6 +5822,7 @@ filegroup( "//tensorflow/c:checkpoint_reader", # checkpoint_reader "//tensorflow/c:python_api", # tf_session "//tensorflow/c:tf_status_helper", # tfe + "//tensorflow/compiler/jit:flags", #tfe "//tensorflow/compiler/mlir/python:mlir", # mlir "//tensorflow/core:core_cpu_base_no_ops", # tf_session "//tensorflow/core:core_cpu_impl", # device_lib @@ -8046,6 +8047,7 @@ tf_python_pybind_extension( "@com_google_absl//absl/types:optional", "@pybind11", "//third_party/python_runtime:headers", + "//tensorflow/compiler/jit:flags_headers_only", "//tensorflow/core:core_cpu_headers_lib", "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -8054,13 +8056,11 @@ tf_python_pybind_extension( "//tensorflow/core/platform:platform", ] + if_static( extra_deps = [ - "//tensorflow/compiler/jit:flags", "//tensorflow/core:eager_service_proto_cc", "//tensorflow/core:master_proto_cc", "//tensorflow/core:worker_proto_cc", ], otherwise = [ - "//tensorflow/compiler/jit:flags_headers_only", "//tensorflow/core:eager_service_proto_cc_headers_only", "//tensorflow/core:master_proto_cc_headers_only", "//tensorflow/core:worker_proto_cc_headers_only", diff --git a/tensorflow/tools/def_file_filter/symbols_pybind.txt b/tensorflow/tools/def_file_filter/symbols_pybind.txt index 7bf9f560e00..1298479009b 100644 --- a/tensorflow/tools/def_file_filter/symbols_pybind.txt +++ b/tensorflow/tools/def_file_filter/symbols_pybind.txt @@ -340,3 +340,6 @@ tensorflow::grappler::AnalyticalCostEstimator::PredictCosts [cost_analyzer_lib] # cost_analyzer tensorflow::grappler::CostAnalyzer::CostAnalyzer tensorflow::grappler::CostAnalyzer::GenerateReport + +[flags] # tfe +tensorflow::IsXlaEnabled From bceed5cc15f4e633689987934aae7544304e1524 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 20 Feb 2020 22:46:47 -0800 Subject: [PATCH 434/442] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 296372660 Change-Id: I2725f20a3e95307c13a765565a8f055525827687 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index b97c2734a6a..aa5e42a57ed 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11611,7 +11611,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -11868,7 +11868,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -11879,7 +11879,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -12085,7 +12085,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12096,7 +12096,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18937,7 +18937,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20077,7 +20077,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21345,7 +21345,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22053,7 +22053,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22249,7 +22249,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22318,7 +22318,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22433,7 +22433,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22492,7 +22492,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22666,7 +22666,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22857,7 +22857,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25297,7 +25297,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25629,7 +25629,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25679,7 +25679,7 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25929,7 +25929,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26559,7 +26559,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27624,7 +27624,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45570,7 +45570,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 0b86c692506b30824c77338732152ab1f0077ce7 Mon Sep 17 00:00:00 2001 From: Paul Donnelly Date: Thu, 20 Feb 2020 22:48:08 -0800 Subject: [PATCH 435/442] Stop the gradient for QuantizeAndDequantizeV2 when the input is out of range. PiperOrigin-RevId: 296372785 Change-Id: Ia0a6168dac58a9a04183a4aa7da93cec231f5fb1 --- tensorflow/cc/gradients/array_grad.cc | 29 ++--- .../api_def_QuantizeAndDequantizeV2Grad.pbtxt | 8 -- .../api_def_QuantizeAndDequantizeV2Grad.pbtxt | 3 - .../api_def_QuantizeAndDequantizeV2Grad.pbtxt | 4 - .../kernels/quantize_and_dequantize_op.cc | 116 ------------------ .../core/kernels/quantize_and_dequantize_op.h | 71 ----------- .../quantize_and_dequantize_op_gpu.cu.cc | 40 ------ .../quantize_and_dequantize_op_test.cc | 48 -------- tensorflow/core/ops/array_ops.cc | 32 ----- .../eager/pywrap_gradient_exclusions.cc | 5 +- tensorflow/python/ops/array_grad.py | 5 + tensorflow/python/ops/array_ops.py | 17 --- .../api/golden/v1/tensorflow.raw_ops.pbtxt | 4 - .../api/golden/v2/tensorflow.raw_ops.pbtxt | 4 - 14 files changed, 17 insertions(+), 369 deletions(-) delete mode 100644 tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2Grad.pbtxt delete mode 100644 tensorflow/core/api_def/java_api/api_def_QuantizeAndDequantizeV2Grad.pbtxt delete mode 100644 tensorflow/core/api_def/python_api/api_def_QuantizeAndDequantizeV2Grad.pbtxt diff --git a/tensorflow/cc/gradients/array_grad.cc b/tensorflow/cc/gradients/array_grad.cc index 3c0813bfe23..e9173227aad 100644 --- a/tensorflow/cc/gradients/array_grad.cc +++ b/tensorflow/cc/gradients/array_grad.cc @@ -15,12 +15,13 @@ limitations under the License. 
#include -#include "tensorflow/cc/framework/grad_op_registry.h" -#include "tensorflow/cc/framework/gradients.h" #include "tensorflow/cc/ops/array_ops_internal.h" #include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/cc/framework/grad_op_registry.h" +#include "tensorflow/cc/framework/gradients.h" + namespace tensorflow { namespace ops { namespace { @@ -89,25 +90,15 @@ Status QuantizeAndDequantizeGrad(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("QuantizeAndDequantize", QuantizeAndDequantizeGrad); -Status QuantizeAndDequantizeV2GradHelper(const Scope& scope, - const Operation& op, - const std::vector& grad_inputs, - std::vector* grad_outputs) { - Input input = Shape(scope, op.input(0)); - Input input_min = op.input(1); - Input input_max = op.input(2); - int64 axis; - TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "axis", &axis)); - auto qdq_v2_grad = QuantizeAndDequantizeV2Grad( - scope, grad_inputs[0], input, input_min, input_max, - QuantizeAndDequantizeV2Grad::Axis(axis)); - grad_outputs->push_back(qdq_v2_grad.input_backprop); - grad_outputs->push_back(qdq_v2_grad.input_min_backprop); - grad_outputs->push_back(qdq_v2_grad.input_max_backprop); +Status QuantizeAndDequantizeV2Grad(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + grad_outputs->push_back(Identity(scope, grad_inputs[0])); + grad_outputs->push_back(NoGradient()); + grad_outputs->push_back(NoGradient()); return scope.status(); } -REGISTER_GRADIENT_OP("QuantizeAndDequantizeV2", - QuantizeAndDequantizeV2GradHelper); +REGISTER_GRADIENT_OP("QuantizeAndDequantizeV2", QuantizeAndDequantizeV2Grad); Status QuantizeAndDequantizeV3Grad(const Scope& scope, const Operation& op, const std::vector& grad_inputs, diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2Grad.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2Grad.pbtxt deleted file mode 100644 index 6a7a2f38897..00000000000 --- a/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2Grad.pbtxt +++ /dev/null @@ -1,8 +0,0 @@ -op { - graph_op_name: "QuantizeAndDequantizeV2Grad" - summary: "Returns the gradient of `QuantizeAndDequantizeV2`." - description: <